优化简历提取速度

2026-04-29 15:02:05 +08:00
parent 0a15e320f5
commit b26c0a7262
5 changed files with 247 additions and 127 deletions
@@ -1,63 +1,181 @@
-"""简历并行提取：将完整简历文本拆分为5个AI任务并行提取"""
+"""简历两阶段并行提取
 第一阶段：5路并行提取主表信息 + 各子表标识名（极快，输出极短）。
 第二阶段：N路并行提取每条子表记录的详情，description 用字母编号引用原文。
 最终组装为与原方案完全一致的 dict 结构，上下游无感知。
 """
 import asyncio
 import time
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from app.ai.models import LLM
 from app.ai.resume_extractor.prompts import (
-    PROFILE_PROMPT, EDUCATION_PROMPT, WORK_PROMPT,
+    OVERVIEW_PROFILE_PROMPT, OVERVIEW_EDUCATION_PROMPT, OVERVIEW_WORK_PROMPT,
-    PROJECT_PROMPT, COMPETITION_PROMPT,
+    OVERVIEW_PROJECT_PROMPT, OVERVIEW_COMPETITION_PROMPT,
    DETAIL_PROFILE_PROMPT, DETAIL_EDUCATION_PROMPT, DETAIL_WORK_PROMPT,
    DETAIL_INTERNSHIP_PROMPT, DETAIL_PROJECT_PROMPT, DETAIL_COMPETITION_PROMPT,
 )
 from app.core.logger import log
 from app.tool.json_helper import parse_llm_json
 _LLM_MODEL = LLM.DOUBAO_PRO_32K
 # ==================== 文本编号 ====================
 def _gen_alpha(n: int):
    """生成 n 个字母编号：a,b,...,z,aa,ab,...,az,ba,..."""
    for i in range(n):
        yield chr(ord('a') + i) if i < 26 else chr(ord('a') + (i // 26 - 1)) + chr(ord('a') + i % 26)
 def _number_lines(text: str) -> tuple[dict[str, str], str]:
    """Label every non-blank line of ``text`` with a letter code.

    The text is split on ``"\\n"``; lines that are empty or whitespace-only
    are dropped, and the remaining lines are kept unmodified.

    Returns:
        A pair ``(line_map, numbered)`` where ``line_map`` maps each label to
        its original line and ``numbered`` is the text with every kept line
        rendered as ``[label] line``, joined by newlines.
    """
    kept = list(filter(str.strip, text.split("\n")))
    line_map: dict[str, str] = {}
    tagged: list[str] = []
    for label, line in zip(_gen_alpha(len(kept)), kept):
        line_map[label] = line
        tagged.append(f"[{label}] {line}")
    return line_map, "\n".join(tagged)
 def _resolve_desc(line_map: dict[str, str], desc_str: str | None) -> list[str]:
    """将逗号分隔的字母编号字符串解析为原文列表"""
    if not desc_str or not isinstance(desc_str, str):
        return []
    keys = [k.strip() for k in desc_str.split(",") if k.strip()]
    return [line_map[k] for k in keys if k in line_map]
 # ==================== LLM 调用工具 ====================
 def _build_chain(prompt: str):
-    """构建单个提取链：prompt → LLM → 文本输出"""
+    """构建提取链：prompt → LLM → 文本输出"""
-    return (
+    return ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")]) | _LLM_MODEL.create(temperature=0) | StrOutputParser()
        ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")])
        | LLM.JIAYU_CLAUDE_SONNET_4_5.create(temperature=0)
        | StrOutputParser()
    )
 # 5 条独立的提取链
 _profile_chain = _build_chain(PROFILE_PROMPT)
 _education_chain = _build_chain(EDUCATION_PROMPT)
 _work_chain = _build_chain(WORK_PROMPT)
 _project_chain = _build_chain(PROJECT_PROMPT)
 _competition_chain = _build_chain(COMPETITION_PROMPT)
 async def extract_all(text: str) -> dict:
    """Run the five extraction chains concurrently and merge their results into one dict.

    NOTE(review): another ``extract_all`` appears further down in this listing; if
    both end up in the same module the later definition replaces this one —
    confirm this single-stage version is meant to go away.
    """
    log.info("开始5路并行AI提取")
    inp = {"text": text}
    # All five calls receive the same input; gather returns results in argument order.
    profile, education, work_intern, project, competition = await asyncio.gather(
        _safe_invoke(_profile_chain, inp, "个人信息"),
        _safe_invoke(_education_chain, inp, "教育经历"),
        _safe_invoke(_work_chain, inp, "工作+实习经历"),
        _safe_invoke(_project_chain, inp, "项目经历"),
        _safe_invoke(_competition_chain, inp, "竞赛经历"),
    )
    # The profile dict is the base of the result; any value of an unexpected
    # type is replaced by an empty container.
    result = profile if isinstance(profile, dict) else {}
    result["education"] = education if isinstance(education, list) else []
    # Work and internship entries come from the same chain, under separate keys.
    result["work"] = work_intern.get("work", []) if isinstance(work_intern, dict) else []
    result["internship"] = work_intern.get("internship", []) if isinstance(work_intern, dict) else []
    result["project"] = project if isinstance(project, list) else []
    result["competition"] = competition if isinstance(competition, list) else []
    return result
 async def _safe_invoke(chain, inp: dict, label: str):
-    """单个链调用，失败返回空"""
+    """单个链调用，记录耗时，失败返回空"""
    start = time.perf_counter()
    try:
        raw = await chain.ainvoke(inp)
        log.info(f"AI提取[{label}]完成，耗时: {time.perf_counter() - start:.2f}s")
        return parse_llm_json(raw)
    except Exception as e:
-        log.warning(f"AI提取[{label}]失败: {e}")
+        log.warning(f"AI提取[{label}]失败，耗时: {time.perf_counter() - start:.2f}s，错误: {e}")
-        return {} if "个人信息" in label else []
+        return None
 # ==================== Stage 1: overview ====================
 # One chain per overview prompt, built once at module level and reused by
 # _extract_overview for every call.
 _overview_profile_chain = _build_chain(OVERVIEW_PROFILE_PROMPT)
 _overview_education_chain = _build_chain(OVERVIEW_EDUCATION_PROMPT)
 _overview_work_chain = _build_chain(OVERVIEW_WORK_PROMPT)
 _overview_project_chain = _build_chain(OVERVIEW_PROJECT_PROMPT)
 _overview_competition_chain = _build_chain(OVERVIEW_COMPETITION_PROMPT)
 async def _extract_overview(numbered_text: str) -> dict:
    """Stage 1: run the five overview chains concurrently on the numbered text.

    Returns:
        A dict with the keys ``profile`` (dict) and ``education``, ``work``,
        ``internship``, ``project``, ``competition`` (lists). Any chain result
        of an unexpected type is replaced by an empty dict or list.
    """
    payload = {"text": numbered_text}
    jobs = [
        (_overview_profile_chain, "概览-个人信息"),
        (_overview_education_chain, "概览-教育"),
        (_overview_work_chain, "概览-工作实习"),
        (_overview_project_chain, "概览-项目"),
        (_overview_competition_chain, "概览-竞赛"),
    ]
    profile, schools, employers, projects, contests = await asyncio.gather(
        *[_safe_invoke(chain, payload, label) for chain, label in jobs]
    )
    # Work and internship names arrive together in one dict.
    if not isinstance(employers, dict):
        employers = {}
    overview: dict = {"profile": profile if isinstance(profile, dict) else {}}
    overview["education"] = schools if isinstance(schools, list) else []
    overview["work"] = employers.get("work", [])
    overview["internship"] = employers.get("internship", [])
    overview["project"] = projects if isinstance(projects, list) else []
    overview["competition"] = contests if isinstance(contests, list) else []
    return overview
 # ==================== 第二阶段：详情 ====================
 async def _extract_detail(prompt_tpl: str, name: str, numbered_text: str, label: str) -> dict | None:
    """Extract the details of the one sub-record identified by ``name``.

    ``{name}`` in ``prompt_tpl`` is replaced with the record name, a chain is
    built from the resulting prompt, and the numbered resume text is sent as
    the human message.

    Args:
        prompt_tpl: Detail prompt containing a ``{name}`` placeholder (a prompt
            without the placeholder is used unchanged).
        name: Identifier of the record, taken from the stage-1 overview.
        numbered_text: Full resume text with letter labels.
        label: Tag used in the log lines written by ``_safe_invoke``.

    Returns:
        Whatever ``_safe_invoke`` returns for the call.
    """
    # The name originates from model output, so it is not guaranteed to be a string.
    safe_name = name if isinstance(name, str) else str(name)
    # _build_chain parses the filled-in prompt as a template again, so literal
    # braces in the name must be doubled or they would be read as a variable.
    safe_name = safe_name.replace("{", "{{").replace("}", "}}")
    prompt = prompt_tpl.replace("{name}", safe_name)
    chain = _build_chain(prompt)
    return await _safe_invoke(chain, {"text": numbered_text}, label)
 async def _extract_all_details(overview: dict, numbered_text: str) -> dict:
    """Stage 2: fetch the details of every sub-record concurrently.

    One task extracts the profile extras (skills / certificates / summary
    lines); one further task is created per name listed in ``overview`` for
    each of the five sub-record modules.

    Returns:
        A dict with the keys ``profile_extra``, ``education``, ``work``,
        ``internship``, ``project`` and ``competition``. Each value is a list
        of result dicts in task order; a non-dict result becomes ``{}``.
    """
    # (module key, prompt template, log label prefix), in output order
    plan = (
        ("education", DETAIL_EDUCATION_PROMPT, "详情-教育-"),
        ("work", DETAIL_WORK_PROMPT, "详情-工作-"),
        ("internship", DETAIL_INTERNSHIP_PROMPT, "详情-实习-"),
        ("project", DETAIL_PROJECT_PROMPT, "详情-项目-"),
        ("competition", DETAIL_COMPETITION_PROMPT, "详情-竞赛-"),
    )
    # owners[i] names the module that the i-th coroutine's result belongs to.
    owners = ["profile_extra"]
    coros = [_extract_detail(DETAIL_PROFILE_PROMPT, "", numbered_text, "详情-个人信息补充")]
    for module, template, prefix in plan:
        for name in overview[module]:
            owners.append(module)
            coros.append(_extract_detail(template, name, numbered_text, f"{prefix}{name}"))
    outcomes = await asyncio.gather(*coros)
    details: dict[str, list] = {"profile_extra": []}
    for module, _template, _prefix in plan:
        details[module] = []
    for module, outcome in zip(owners, outcomes):
        details[module].append(outcome if isinstance(outcome, dict) else {})
    return details
 # ==================== 组装 ====================
 def _assemble(overview: dict, details: dict, line_map: dict[str, str]) -> dict:
    """Merge the stage-1 profile and the stage-2 details into the final result dict.

    The profile dict from ``overview`` gains ``skills``, ``certificates`` and
    ``summary`` (summary lines joined with newlines, or ``None`` when nothing
    resolves). Every sub-record has its ``descLines`` entry removed and
    replaced by ``description``, the list of original lines it refers to.
    The profile dict and the detail records are modified in place.
    """
    profile = overview["profile"]
    extras = details.get("profile_extra")
    profile_extra = extras[0] if extras else {}
    profile["skills"] = profile_extra.get("skills") or []
    profile["certificates"] = profile_extra.get("certificates") or []
    summary_texts = _resolve_desc(line_map, profile_extra.get("summaryLines"))
    if summary_texts:
        profile["summary"] = "\n".join(summary_texts)
    else:
        profile["summary"] = None
    result = dict(profile)
    for module in ("education", "work", "internship", "project", "competition"):
        records = details.get(module, [])
        for record in records:
            record["description"] = _resolve_desc(line_map, record.pop("descLines", None))
        result[module] = list(records)
    return result
 # ==================== 入口 ====================
 async def extract_all(text: str) -> dict:
    """Extract a resume in two concurrent stages and return the assembled dict.

    The text is first labelled line by line. Stage 1 collects the profile
    fields and the names of all sub-records; stage 2 fetches the details of
    each record; the results are then merged by ``_assemble``.

    Args:
        text: Plain text of the resume.

    Returns:
        The profile fields plus the ``education``, ``work``, ``internship``,
        ``project`` and ``competition`` lists.
    """
    line_map, numbered_text = _number_lines(text)
    log.info(f"文本编号完成，共 {len(line_map)} 行")
    log.info("第一阶段：5路并行概览提取")
    overview = await _extract_overview(numbered_text)
    log.info(f"概览完成 - 教育:{len(overview['education'])} 工作:{len(overview['work'])} 实习:{len(overview['internship'])} 项目:{len(overview['project'])} 竞赛:{len(overview['competition'])}")
    # One task per sub-record, plus the profile-extras task that
    # _extract_all_details always schedules.
    total = 1 + sum(len(overview[m]) for m in ("education", "work", "internship", "project", "competition"))
    log.info(f"第二阶段：{total}路并行详情提取")
    details = await _extract_all_details(overview, numbered_text)
    result = _assemble(overview, details, line_map)
    log.info("两阶段提取完成，数据组装完毕")
    return result
@@ -1,79 +1,87 @@
-"""简历各模块提取的 System Prompt
+"""简历两阶段提取 Prompt
-注意：prompt 中的 JSON 示例花括号必须用 {{ }} 转义，避免被 ChatPromptTemplate 当作变量。
+第一阶段（概览）：5路并行，只提取主表短字段和子表标识名，不提取 description。
 第二阶段（详情）：N路并行，每条子表记录单独提取全部字段，description 用字母编号引用。
 花括号用 {{ }} 转义，避免被 ChatPromptTemplate 当作变量。{name} 为运行时替换的记录标识名。
 """
-PROFILE_PROMPT = """从简历文本中仅提取个人基本信息，原文提取不要改写，输出JSON：
+# ==================== 第一阶段：概览提取 ====================
 ```json
 {{
  "name": "姓名",
  "email": "邮箱",
  "mobileNumber": "手机号",
  "city": "所在城市",
  "wechatNumber": "微信号",
  "portfolioUrl": "作品集链接",
  "skills": ["技能1"],
  "certificates": ["证书1"],
  "summary": "个人概述原文"
 }}
 ```
 规则：只提取个人信息，不提取经历内容。summary只填"自我评价/个人概述"原文。没有的填null，数组填[]。只输出JSON。"""
-EDUCATION_PROMPT = """从简历文本中仅提取教育经历，原文提取不要改写，输出JSON数组：
+OVERVIEW_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造，没有的填null。
 从中仅提取个人基本信息（不含技能、证书、自我评价），输出JSON：
 ```json
-[{{
+{{ "name": "姓名", "email": "邮箱", "mobileNumber": "手机号", "city": "所在城市", "wechatNumber": "微信号", "portfolioUrl": "作品集链接" }}
  "school": "学校",
  "major": "专业",
  "degree": "学历",
  "studyType": "全日制/非全日制",
  "startDate": "2020.09",
  "endDate": "2024.06",
  "description": ["原文段落"]
 }}]
 ```
-规则：只提取教育经历，不提取工作/实习/项目/竞赛。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
+规则：只提取以上6个字段，不提取skills/certificates/summary/经历。没有的填null。只输出JSON。"""
-WORK_PROMPT = """从简历文本中仅提取工作经历和实习经历，原文提取不要改写，输出JSON：
+OVERVIEW_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 从中仅提取教育经历的学校名称列表，输出JSON数组：
 ```json
-{{
+["北京大学", "清华大学"]
  "work": [{{
    "companyName": "公司",
    "position": "职位",
    "startDate": "2024.07",
    "endDate": "2025.03",
    "description": ["原文段落"]
  }}],
  "internship": [{{
    "companyName": "公司",
    "position": "职位",
    "startDate": "2023.06",
    "endDate": "2023.09",
    "description": ["原文段落"]
  }}]
 }}
 ```
-规则：标注"实习"的归internship，其余归work。不提取项目/教育/竞赛。时间格式YYYY.MM。没有填[]。只输出JSON。"""
+规则：只提取学校名称，不提取其他字段。没有输出[]。只输出JSON。"""
-PROJECT_PROMPT = """从简历文本中仅提取项目经历，原文提取不要改写，输出JSON数组：
+OVERVIEW_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 从中仅提取工作经历和实习经历的公司名称列表，输出JSON：
 ```json
-[{{
+{{ "work": ["阿里巴巴", "腾讯"], "internship": ["字节跳动"] }}
  "companyName": "所属公司",
  "projectName": "项目名",
  "role": "角色名称（如：后端开发、项目经理、前端工程师，只填角色名不填职责描述）",
  "startDate": "2023.03",
  "endDate": "2023.12",
  "description": ["原文段落"]
 }}]
 ```
-规则：只提取项目经历，不提取工作/实习/教育/竞赛。role只填简短角色名，职责内容放description。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
+规则：标注"实习"的归internship，其余归work。只提取公司名称。没有填[]。只输出JSON。"""
-COMPETITION_PROMPT = """从简历文本中仅提取竞赛/获奖经历，原文提取不要改写，输出JSON数组：
+OVERVIEW_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 从中仅提取项目经历的项目名称列表，输出JSON数组：
 ```json
-[{{
+["订单系统重构", "支付网关"]
  "competitionName": "竞赛名",
  "award": "获奖情况",
  "awardDate": "2023.07",
  "description": ["原文段落"]
 }}]
 ```
-规则：只提取竞赛获奖，不提取其他经历。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
+规则：只提取项目名称，不提取其他字段。没有输出[]。只输出JSON。"""
 OVERVIEW_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 从中仅提取竞赛/获奖经历的竞赛名称列表，输出JSON数组：
 ```json
 ["ACM区域赛", "数学建模大赛"]
 ```
 规则：只提取竞赛名称，不提取其他字段。没有输出[]。只输出JSON。"""
 # ==================== 第二阶段：详情提取 ====================
 DETAIL_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 请提取"{name}"这条教育经历的详细信息，输出JSON：
 ```json
 {{ "school": "学校", "major": "专业", "degree": "学历", "studyType": "全日制/非全日制", "startDate": "2020.09", "endDate": "2024.06", "descLines": "e,f,g" }}
 ```
 规则：descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
 DETAIL_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 请提取"{name}"这条工作经历的详细信息，输出JSON：
 ```json
 {{ "companyName": "公司", "position": "职位", "startDate": "2024.07", "endDate": "2025.03", "descLines": "h,i,j" }}
 ```
 规则：descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
 DETAIL_INTERNSHIP_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 请提取"{name}"这条实习经历的详细信息，输出JSON：
 ```json
 {{ "companyName": "公司", "position": "职位", "startDate": "2023.06", "endDate": "2023.09", "descLines": "p,q,r" }}
 ```
 规则：descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
 DETAIL_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 请提取"{name}"这条项目经历的详细信息，输出JSON：
 ```json
 {{ "companyName": "所属公司", "projectName": "项目名", "role": "角色名称", "startDate": "2023.03", "endDate": "2023.12", "descLines": "u,v,w" }}
 ```
 规则：role只填简短角色名。descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
 DETAIL_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 请提取"{name}"这条竞赛/获奖经历的详细信息，输出JSON：
 ```json
 {{ "competitionName": "竞赛名", "award": "获奖情况", "awardDate": "2023.07", "descLines": "ae,af" }}
 ```
 规则：descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
 DETAIL_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
 从中提取技能标签、证书和自我评价/个人概述，输出JSON：
 ```json
 {{ "skills": ["技能1"], "certificates": ["证书1"], "summaryLines": "k,l,m" }}
 ```
 规则：skills填技能标签数组。certificates填证书数组。summaryLines填自我评价/个人概述对应的字母编号逗号分隔。没有的填null，数组填[]。只输出JSON。"""
@@ -1,7 +1,7 @@
 """简历解析 Service
-上传简历文件 → 解析为纯文本 → AI 并行结构化 → 写入数据库。
+上传简历文件 → 解析为纯文本 → AI 两阶段并行结构化 → 写入数据库。
-依赖：file_parser（文件解析工具）、resume_extractor（AI并行提取）
+依赖：file_parser（文件解析工具）、resume_extractor（AI两阶段并行提取）
 使用表：bg_user_resume（主表）、bg_user_resume_education/work/internship/project/competition（5张子表）
 """
@@ -25,16 +25,16 @@ from app.tool.snowflake import next_id
 class ResumeParseService:
    async def parse_and_extract(self, filename: str, content: bytes) -> dict:
-        """文件解析 + AI 并行结构化，不涉及数据库操作"""
+        """文件解析 + AI 两阶段并行结构化，不涉及数据库操作"""
        log.info(f"开始解析简历文件: {filename}")
        text = await asyncio.to_thread(parse_to_text, filename, content)
        if not text or not text.strip():
            raise ValueError("文件内容为空，无法解析")
        log.info(f"文件解析完成，文本长度: {len(text)}")
-        log.info("开始AI并行结构化提取")
+        log.info("开始AI两阶段并行结构化提取")
        parsed = await extract_all(text)
-        log.info("AI并行结构化提取完成")
+        log.info("AI两阶段并行结构化提取完成")
        return parsed
    async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int:
@@ -1,24 +1,25 @@
 """文件解析工具
 将上传的简历文件（PDF / Word / TXT）转换为纯文本字符串。
 PDF 使用 PyMuPDF (fitz) 按文本块提取，保持段落边界和阅读顺序。
 """
 import io
-import pdfplumber
+import fitz
 from docx import Document
 from app.core.logger import log
 def parse_pdf(content: bytes) -> str:
-    """解析 PDF 文件，提取全部页面文本"""
+    """解析 PDF 文件，按文本块提取，过滤图片块，保持阅读顺序"""
    text_parts: list[str] = []
-    with pdfplumber.open(io.BytesIO(content)) as pdf:
+    with fitz.open(stream=content, filetype="pdf") as doc:
-        for page in pdf.pages:
+        for page in doc:
-            page_text = page.extract_text()
+            for b in page.get_text("blocks", sort=True):
-            if page_text:
+                if b[6] == 0 and b[4].strip():  # type 0=文本块, 1=图片块
-                text_parts.append(page_text)
+                    text_parts.append(b[4].strip())
    return "\n".join(text_parts)
@@ -28,22 +29,16 @@ def parse_docx(content: bytes) -> str:
        doc = Document(io.BytesIO(content))
    except Exception:
        raise ValueError("无法解析该 Word 文件，如果是旧版 .doc 格式，请另存为 .docx 后重试")
    text_parts: list[str] = []
    # 段落
    for para in doc.paragraphs:
        text = para.text.strip()
        if text:
            text_parts.append(text)
    # 表格
    for table in doc.tables:
        for row in table.rows:
            row_text = "\t".join(cell.text.strip() for cell in row.cells)
            if row_text.strip():
                text_parts.append(row_text)
    return "\n".join(text_parts)
@@ -61,7 +56,6 @@ def parse_to_text(filename: str, content: bytes) -> str:
    """根据文件名后缀自动选择解析方法，返回纯文本"""
    suffix = filename[filename.rfind("."):].lower() if "." in filename else ""
    log.info(f"解析文件: {filename}，类型: {suffix}")
    if suffix == ".pdf":
        return parse_pdf(content)
    elif suffix in (".docx", ".doc"):
@@ -43,7 +43,7 @@ python-multipart>=0.0.9
 python-dotenv>=1.0.0
 # 文件解析
-pdfplumber>=0.11.0
+pymupdf>=1.24.0
 python-docx>=1.1.0
 # 雪花ID