Files
post_crawler/src/company_importer/importer.py
T
2026-05-26 21:02:17 +08:00

156 lines
5.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Excel 公司名单导入"""
import logging
from pathlib import Path
from openpyxl import load_workbook
from src.database import SessionLocal, CrawlTask, TaskSearch
from src.company_importer.url_validator import extract_url, validate_url, clean_url
# Module-level logger named after this module; used by import_companies for
# per-row debug messages, insert warnings and the final import summary.
logger = logging.getLogger(__name__)
def import_companies(
    file_path: str,
    company_col: int = 1,
    url_col: int = 2,
) -> dict:
    """Import a company list from an Excel workbook and create crawl tasks.

    Rows of the active worksheet are read starting at row 2 (row 1 is treated
    as the header). For every row with a non-empty company name and a URL
    accepted by ``validate_url``, a ``CrawlTask`` and a linked ``TaskSearch``
    (carrying the cleaned URL) are inserted and committed individually, so one
    failing row does not undo earlier rows.

    Args:
        file_path: Full path to the Excel file; must end in ``.xlsx``.
        company_col: 1-based index of the column holding the company name.
        url_col: 1-based index of the column holding the recruiting URL.

    Returns:
        A dict of import statistics with the integer counters ``total_rows``,
        ``inserted_count``, ``skipped_empty_name``, ``skipped_empty_url``,
        ``skipped_email``, ``skipped_weixin``, ``skipped_invalid_url`` and
        ``skipped_duplicate``, plus ``inserted_companies`` (list of the
        company names that were inserted).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file is not ``.xlsx``, if a column index is below 1,
            or if a column index exceeds the worksheet's column count.
        Exception: Any other error raised while querying or iterating is
            logged, the session is rolled back, and the error is re-raised.
    """
    # 1. Validate the file before touching the workbook.
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"文件不存在: {file_path}")
    if path.suffix.lower() != ".xlsx":
        raise ValueError(f"仅支持 .xlsx 格式,当前文件: {path.suffix}")

    # Column indices are 1-based. A value below 1 would turn ``row[col - 1]``
    # into a negative index and silently read a column counted from the right,
    # so reject it up front instead of importing the wrong data.
    if company_col < 1 or url_col < 1:
        raise ValueError(
            f"列索引必须从1开始,company_col={company_col}, url_col={url_col}"
        )

    # Maps the rejection reason returned by validate_url to the counter it
    # increments; any reason not listed here counts as a generic invalid URL.
    skip_stat_by_reason = {
        "email": "skipped_email",
        "weixin": "skipped_weixin",
        "empty_url": "skipped_empty_url",
    }

    stats = {
        "total_rows": 0,
        "inserted_count": 0,
        "skipped_empty_name": 0,
        "skipped_empty_url": 0,
        "skipped_email": 0,
        "skipped_weixin": 0,
        "skipped_invalid_url": 0,
        "skipped_duplicate": 0,
        "inserted_companies": [],
    }

    # 2. Read the workbook. read_only=False is kept from the original code,
    # whose note says it is required to read cell hyperlinks.
    wb = load_workbook(file_path, read_only=False)
    try:
        ws = wb.active

        # Reject column indices beyond the sheet's last column.
        max_col = ws.max_column or 0
        if company_col > max_col or url_col > max_col:
            raise ValueError(f"列索引超出范围,表格共{max_col}列,company_col={company_col}, url_col={url_col}")

        # 3. Load the existing company names once for in-memory de-duplication.
        db = SessionLocal()
        try:
            existing_rows = db.query(CrawlTask.company_name).all()
            existing_set = {existing[0] for existing in existing_rows}

            # 4. Process row by row, skipping the header row.
            for row_idx, row in enumerate(ws.iter_rows(min_row=2), start=2):
                stats["total_rows"] += 1

                # 4.1 Company name: a falsy cell value is treated as empty.
                company_cell = row[company_col - 1]
                company_name = str(company_cell.value).strip() if company_cell.value else ""
                if not company_name:
                    stats["skipped_empty_name"] += 1
                    logger.debug(f"{row_idx}行: 公司名为空,跳过")
                    continue

                # 4.2 Extract the raw URL from the URL cell.
                raw_url = extract_url(row[url_col - 1])
                if not raw_url:
                    stats["skipped_empty_url"] += 1
                    logger.debug(f"{row_idx}行: {company_name} 地址为空,跳过")
                    continue

                # 4.3 Validate the URL and count the rejection by its reason.
                is_valid, reason = validate_url(raw_url)
                if not is_valid:
                    stats[skip_stat_by_reason.get(reason, "skipped_invalid_url")] += 1
                    logger.debug(f"{row_idx}行: {company_name} 地址无效({reason}): {raw_url}")
                    continue

                # 4.4 Normalize the URL.
                url = clean_url(raw_url)

                # 4.5 Skip names already in the database or inserted earlier
                # in this same run.
                if company_name in existing_set:
                    stats["skipped_duplicate"] += 1
                    logger.debug(f"{row_idx}行: {company_name} 已存在,跳过")
                    continue

                # 4.6 Insert this row in its own transaction.
                try:
                    crawl_task = CrawlTask(company_name=company_name)
                    db.add(crawl_task)
                    # Flush so crawl_task.id is populated before it is
                    # referenced by the TaskSearch row below.
                    db.flush()

                    # Step-1 search task carrying the cleaned input URL.
                    task_search = TaskSearch(
                        crawl_task_id=crawl_task.id,
                        input_company_name=company_name,
                        input_url=url,
                    )
                    db.add(task_search)
                    db.commit()

                    stats["inserted_count"] += 1
                    stats["inserted_companies"].append(company_name)
                    existing_set.add(company_name)
                except Exception as e:
                    db.rollback()
                    # NOTE(review): every insert failure is counted as a
                    # duplicate, even when the cause is something else; kept
                    # as-is so the returned stats keep their existing meaning.
                    stats["skipped_duplicate"] += 1
                    logger.warning(f"{row_idx}行: {company_name} 插入失败: {e}")

            logger.info(
                f"导入完成: 总{stats['total_rows']}行, "
                f"成功{stats['inserted_count']}条, "
                f"空名{stats['skipped_empty_name']}, "
                f"空地址{stats['skipped_empty_url']}, "
                f"邮箱{stats['skipped_email']}, "
                f"微信{stats['skipped_weixin']}, "
                f"无效{stats['skipped_invalid_url']}, "
                f"重复{stats['skipped_duplicate']}"
            )
            return stats
        except Exception as e:
            logger.error(f"导入异常: {e}")
            db.rollback()
            raise
        finally:
            db.close()
    finally:
        # Close the workbook on every exit path, including failures that
        # happen before or while the database session is opened or closed.
        wb.close()