generated from kgod/ai-review-template
156 lines
5.2 KiB
Python
156 lines
5.2 KiB
Python
"""Excel 公司名单导入"""
|
||
import logging
|
||
from pathlib import Path
|
||
|
||
from openpyxl import load_workbook
|
||
|
||
from src.database import SessionLocal, CrawlTask, TaskSearch
|
||
from src.company_importer.url_validator import extract_url, validate_url, clean_url
|
||
|
||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def import_companies(
    file_path: str,
    company_col: int = 1,
    url_col: int = 2,
) -> dict:
    """Import a company list from an Excel file and create crawl tasks.

    The first row of the active worksheet is treated as a header and is
    skipped. For every following row the company name and recruiting URL
    are read; rows that pass validation and are not already known are
    stored as a ``CrawlTask`` plus a ``TaskSearch`` record, committed one
    row at a time so a single bad row does not abort the whole import.

    Args:
        file_path: Full path to the Excel file (``.xlsx`` only).
        company_col: 1-based index of the column holding the company name.
        url_col: 1-based index of the column holding the recruiting URL.

    Returns:
        A dict of import statistics with the keys ``total_rows``,
        ``inserted_count``, ``skipped_empty_name``, ``skipped_empty_url``,
        ``skipped_email``, ``skipped_weixin``, ``skipped_invalid_url``,
        ``skipped_duplicate`` (all ints) and ``inserted_companies``
        (list of the company names that were inserted).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file is not ``.xlsx``, if a column index is
            smaller than 1, or if a column index exceeds the sheet width.
        Exception: Any unexpected error during the import is logged,
            the session is rolled back, and the error is re-raised.
    """
    # 1. Validate the file.
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")
    if path.suffix.lower() != ".xlsx":
        raise ValueError(f"仅支持 .xlsx 格式,当前文件: {path.suffix}")

    # Column indices are 1-based. Without this guard 0 or a negative value
    # would make ``row[col - 1]`` wrap around and silently read a column
    # counted from the end of the row.
    if company_col < 1 or url_col < 1:
        raise ValueError(
            f"列索引必须从1开始,company_col={company_col}, url_col={url_col}"
        )

    # 2. Read the Excel file.
    # read_only=False so that cell hyperlinks can be read.
    wb = load_workbook(file_path, read_only=False)
    try:
        ws = wb.active

        # Validate the column indices against the sheet width.
        max_col = ws.max_column or 0
        if company_col > max_col or url_col > max_col:
            raise ValueError(f"列索引超出范围,表格共{max_col}列,company_col={company_col}, url_col={url_col}")

        # Counters reported back to the caller.
        stats = {
            "total_rows": 0,
            "inserted_count": 0,
            "skipped_empty_name": 0,
            "skipped_empty_url": 0,
            "skipped_email": 0,
            "skipped_weixin": 0,
            "skipped_invalid_url": 0,
            "skipped_duplicate": 0,
            "inserted_companies": [],
        }

        # 3. Load the existing company names once for in-memory dedup.
        db = SessionLocal()
        try:
            existing_rows = db.query(CrawlTask.company_name).all()
            existing_set = {row[0] for row in existing_rows}

            # 4. Process row by row (min_row=2 skips the header row).
            for row_idx, row in enumerate(ws.iter_rows(min_row=2), start=2):
                stats["total_rows"] += 1

                # 4.1 Read the company name.
                company_cell = row[company_col - 1]
                company_name = str(company_cell.value).strip() if company_cell.value else ""

                if not company_name:
                    stats["skipped_empty_name"] += 1
                    logger.debug(f"第{row_idx}行: 公司名为空,跳过")
                    continue

                # 4.2 Read the cell and extract the URL from it.
                url_cell = row[url_col - 1]
                raw_url = extract_url(url_cell)

                if not raw_url:
                    stats["skipped_empty_url"] += 1
                    logger.debug(f"第{row_idx}行: {company_name} 地址为空,跳过")
                    continue

                # 4.3 Validate the URL and bucket the rejection reason.
                is_valid, reason = validate_url(raw_url)
                if not is_valid:
                    if reason == "email":
                        stats["skipped_email"] += 1
                    elif reason == "weixin":
                        stats["skipped_weixin"] += 1
                    elif reason == "empty_url":
                        stats["skipped_empty_url"] += 1
                    else:
                        stats["skipped_invalid_url"] += 1
                    logger.debug(f"第{row_idx}行: {company_name} 地址无效({reason}): {raw_url}")
                    continue

                # 4.4 Clean the URL.
                url = clean_url(raw_url)

                # 4.5 In-memory dedup (covers both rows already stored and
                # names inserted earlier in this same run).
                if company_name in existing_set:
                    stats["skipped_duplicate"] += 1
                    logger.debug(f"第{row_idx}行: {company_name} 已存在,跳过")
                    continue

                # 4.6 Store the row in its own transaction.
                try:
                    # Insert the main record; flush so its id is assigned
                    # before the dependent record references it.
                    crawl_task = CrawlTask(company_name=company_name)
                    db.add(crawl_task)
                    db.flush()

                    # Insert the step-1 task (carries input_url).
                    task_search = TaskSearch(
                        crawl_task_id=crawl_task.id,
                        input_company_name=company_name,
                        input_url=url,
                    )
                    db.add(task_search)

                    db.commit()
                    stats["inserted_count"] += 1
                    stats["inserted_companies"].append(company_name)
                    existing_set.add(company_name)

                except Exception as e:
                    db.rollback()
                    # NOTE(review): every insert failure is counted as a
                    # duplicate, whatever the actual cause; the warning
                    # below carries the real error.
                    stats["skipped_duplicate"] += 1
                    logger.warning(f"第{row_idx}行: {company_name} 插入失败: {e}")
                    continue

            logger.info(
                f"导入完成: 总{stats['total_rows']}行, "
                f"成功{stats['inserted_count']}条, "
                f"空名{stats['skipped_empty_name']}, "
                f"空地址{stats['skipped_empty_url']}, "
                f"邮箱{stats['skipped_email']}, "
                f"微信{stats['skipped_weixin']}, "
                f"无效{stats['skipped_invalid_url']}, "
                f"重复{stats['skipped_duplicate']}"
            )

            return stats

        except Exception as e:
            logger.error(f"导入异常: {e}")
            db.rollback()
            raise
        finally:
            db.close()
    finally:
        # Always release the workbook, including when the column check
        # fails or the database session cannot be created.
        wb.close()
|