优化简历提取速度

2026-04-29 15:02:05 +08:00
parent 0a15e320f5
commit b26c0a7262
5 changed files with 247 additions and 127 deletions
@@ -1,63 +1,181 @@
-"""简历并行提取：将完整简历文本拆分为5个AI任务并行提取"""
+"""简历两阶段并行提取
+
+第一阶段：5路并行提取主表信息 + 各子表标识名（极快，输出极短）。
+第二阶段：N路并行提取每条子表记录的详情，description 用字母编号引用原文。
+最终组装为与原方案完全一致的 dict 结构，上下游无感知。
+"""

 import asyncio
+import time

 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate

 from app.ai.models import LLM
 from app.ai.resume_extractor.prompts import (
-    PROFILE_PROMPT, EDUCATION_PROMPT, WORK_PROMPT,
-    PROJECT_PROMPT, COMPETITION_PROMPT,
+    OVERVIEW_PROFILE_PROMPT, OVERVIEW_EDUCATION_PROMPT, OVERVIEW_WORK_PROMPT,
+    OVERVIEW_PROJECT_PROMPT, OVERVIEW_COMPETITION_PROMPT,
+    DETAIL_PROFILE_PROMPT, DETAIL_EDUCATION_PROMPT, DETAIL_WORK_PROMPT,
+    DETAIL_INTERNSHIP_PROMPT, DETAIL_PROJECT_PROMPT, DETAIL_COMPETITION_PROMPT,
 )
 from app.core.logger import log
 from app.tool.json_helper import parse_llm_json

+# Model handle that _build_chain instantiates (with temperature=0) for each chain.
+_LLM_MODEL = LLM.DOUBAO_PRO_32K
+
+
+# ==================== 文本编号 ====================
+
+def _gen_alpha(n: int):
+    """Yield the first ``n`` lowercase letter labels: a..z, aa..zz, aaa, ...
+
+    Labels follow bijective base-26 (spreadsheet-column style), so they stay
+    purely alphabetic and unique for any ``n``; the first 702 labels are the
+    familiar one- and two-letter sequence.
+
+    Args:
+        n: Number of labels to produce; ``n <= 0`` yields nothing.
+
+    Yields:
+        The label for each index from 0 to ``n - 1``, in order.
+    """
+    for i in range(n):
+        label = ""
+        k = i + 1  # bijective base-26 is defined on 1-based ordinals
+        while k > 0:
+            k, rem = divmod(k - 1, 26)
+            label = chr(ord("a") + rem) + label
+        yield label
+
+
+def _number_lines(text: str) -> tuple[dict[str, str], str]:
+    """Label every non-blank line of ``text`` with a letter code.
+
+    The text is split on ``"\\n"``; lines that are empty or whitespace-only
+    are dropped, and the remaining lines are kept verbatim (not stripped).
+
+    Returns:
+        A pair ``(line_map, numbered)`` where ``line_map`` maps each label to
+        its original line and ``numbered`` is the text rebuilt with a
+        ``[label] `` prefix on every kept line.
+    """
+    kept = [row for row in text.split("\n") if row.strip()]
+    line_map: dict[str, str] = {}
+    tagged: list[str] = []
+    for label, row in zip(_gen_alpha(len(kept)), kept):
+        line_map[label] = row
+        tagged.append(f"[{label}] {row}")
+    return line_map, "\n".join(tagged)
+
+
+def _resolve_desc(line_map: dict[str, str], desc_str: str | None) -> list[str]:
+    """Turn a model-produced list of line labels back into the original lines.
+
+    The expected input is a comma-separated string such as ``"e,f,g"``.
+    Because the value comes from an LLM, a few common deviations are also
+    accepted instead of silently producing an empty description: full-width
+    or semicolon separators, labels echoed with their ``[ ]`` brackets, and a
+    JSON array of labels.
+
+    Args:
+        line_map: Mapping of label to original line text.
+        desc_str: Label list from the model; ``None``, empty, or any other
+            unsupported type resolves to ``[]``.
+
+    Returns:
+        The original lines in the order the labels were given; labels that
+        are not in ``line_map`` are skipped.
+    """
+    if isinstance(desc_str, (list, tuple)):
+        # Some replies use a JSON array (["e", "f"]) rather than "e,f".
+        desc_str = ",".join(str(part) for part in desc_str if part is not None)
+    if not desc_str or not isinstance(desc_str, str):
+        return []
+    # Normalise separators a Chinese-language model commonly emits.
+    for sep in ("，", "、", "；", ";"):
+        desc_str = desc_str.replace(sep, ",")
+    resolved: list[str] = []
+    for token in desc_str.split(","):
+        key = token.strip().strip("[]").strip()
+        if key in line_map:
+            resolved.append(line_map[key])
+    return resolved
+
+
+# ==================== LLM 调用工具 ====================

 def _build_chain(prompt: str):
-    """构建单个提取链：prompt → LLM → 文本输出"""
-    return (
-        ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")])
-        | LLM.JIAYU_CLAUDE_SONNET_4_5.create(temperature=0)
-        | StrOutputParser()
-    )
-
-
-# 5 条独立的提取链
-_profile_chain = _build_chain(PROFILE_PROMPT)
-_education_chain = _build_chain(EDUCATION_PROMPT)
-_work_chain = _build_chain(WORK_PROMPT)
-_project_chain = _build_chain(PROJECT_PROMPT)
-_competition_chain = _build_chain(COMPETITION_PROMPT)
-
-
-async def extract_all(text: str) -> dict:
-    """asyncio.gather 并行提取简历所有模块，返回合并后的结构化数据"""
-    log.info("开始5路并行AI提取")
-    inp = {"text": text}
-
-    profile, education, work_intern, project, competition = await asyncio.gather(
-        _safe_invoke(_profile_chain, inp, "个人信息"),
-        _safe_invoke(_education_chain, inp, "教育经历"),
-        _safe_invoke(_work_chain, inp, "工作+实习经历"),
-        _safe_invoke(_project_chain, inp, "项目经历"),
-        _safe_invoke(_competition_chain, inp, "竞赛经历"),
-    )
-
-    result = profile if isinstance(profile, dict) else {}
-    result["education"] = education if isinstance(education, list) else []
-    result["work"] = work_intern.get("work", []) if isinstance(work_intern, dict) else []
-    result["internship"] = work_intern.get("internship", []) if isinstance(work_intern, dict) else []
-    result["project"] = project if isinstance(project, list) else []
-    result["competition"] = competition if isinstance(competition, list) else []
-    return result
+    """构建提取链：prompt → LLM → 文本输出"""
+    return ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")]) | _LLM_MODEL.create(temperature=0) | StrOutputParser()


 async def _safe_invoke(chain, inp: dict, label: str):
-    """单个链调用，失败返回空"""
+    """单个链调用，记录耗时，失败返回空"""
+    start = time.perf_counter()
    try:
        raw = await chain.ainvoke(inp)
+        log.info(f"AI提取[{label}]完成，耗时: {time.perf_counter() - start:.2f}s")
        return parse_llm_json(raw)
    except Exception as e:
-        log.warning(f"AI提取[{label}]失败: {e}")
-        return {} if "个人信息" in label else []
+        log.warning(f"AI提取[{label}]失败，耗时: {time.perf_counter() - start:.2f}s，错误: {e}")
+        return None
+
+
+# ==================== 第一阶段：概览 ====================
+
+# Phase-1 chains: one per overview prompt, built once when the module is imported
+# and reused by _extract_overview for every resume.
+_overview_profile_chain = _build_chain(OVERVIEW_PROFILE_PROMPT)
+_overview_education_chain = _build_chain(OVERVIEW_EDUCATION_PROMPT)
+_overview_work_chain = _build_chain(OVERVIEW_WORK_PROMPT)
+_overview_project_chain = _build_chain(OVERVIEW_PROJECT_PROMPT)
+_overview_competition_chain = _build_chain(OVERVIEW_COMPETITION_PROMPT)
+
+
+async def _extract_overview(numbered_text: str) -> dict:
+    """Phase 1: run the five overview chains concurrently and normalise the replies.
+
+    Args:
+        numbered_text: Resume text in which every line carries a ``[label]`` prefix.
+
+    Returns:
+        A dict with ``profile`` (dict) plus ``education``, ``work``,
+        ``internship``, ``project`` and ``competition`` (lists). Any reply of
+        the wrong shape, including a failed call, degrades to an empty dict
+        or list so later stages can always iterate and take ``len()``.
+    """
+    def _as_list(value) -> list:
+        # Model output is untrusted: null or a bare string must not reach
+        # len()/enumerate() in the later stages.
+        return value if isinstance(value, list) else []
+
+    inp = {"text": numbered_text}
+    profile, edu_names, work_names, proj_names, comp_names = await asyncio.gather(
+        _safe_invoke(_overview_profile_chain, inp, "概览-个人信息"),
+        _safe_invoke(_overview_education_chain, inp, "概览-教育"),
+        _safe_invoke(_overview_work_chain, inp, "概览-工作实习"),
+        _safe_invoke(_overview_project_chain, inp, "概览-项目"),
+        _safe_invoke(_overview_competition_chain, inp, "概览-竞赛"),
+    )
+    work_part = work_names if isinstance(work_names, dict) else {}
+    return {
+        "profile": profile if isinstance(profile, dict) else {},
+        "education": _as_list(edu_names),
+        # Validate the nested lists too: {"work": null} is a plausible reply.
+        "work": _as_list(work_part.get("work")),
+        "internship": _as_list(work_part.get("internship")),
+        "project": _as_list(proj_names),
+        "competition": _as_list(comp_names),
+    }
+
+
+# ==================== 第二阶段：详情 ====================
+
+async def _extract_detail(prompt_tpl: str, name: str, numbered_text: str, label: str) -> dict | None:
+    """Extract the details of one sub-record identified by ``name``.
+
+    The ``{name}`` placeholder in ``prompt_tpl`` is filled in by plain string
+    replacement, a chain is built from the resulting prompt, and the numbered
+    resume text is sent through it.
+
+    Args:
+        prompt_tpl: Detail prompt, optionally containing ``{name}``.
+        name: Record identifier produced by phase 1 (model output, so it is
+            coerced to ``str`` rather than trusted to be one).
+        numbered_text: Resume text with ``[label]`` line prefixes.
+        label: Tag used in log messages.
+
+    Returns:
+        Whatever ``_safe_invoke`` returns: the parsed reply, or ``None`` on failure.
+    """
+    # The substituted prompt is parsed again as a template by ChatPromptTemplate,
+    # so braces inside the name must be doubled or they would be read as variables.
+    safe_name = str(name).replace("{", "{{").replace("}", "}}")
+    prompt = prompt_tpl.replace("{name}", safe_name)
+    chain = _build_chain(prompt)
+    return await _safe_invoke(chain, {"text": numbered_text}, label)
+
+
+async def _extract_all_details(overview: dict, numbered_text: str) -> dict:
+    """Phase 2: fetch details for every record found in phase 1, concurrently.
+
+    One task extracts the profile supplement (skills / certificates / summary
+    lines); one further task is created per name in each overview module.
+
+    Args:
+        overview: Phase-1 result; must contain a list under each of
+            ``education``, ``work``, ``internship``, ``project``, ``competition``.
+        numbered_text: Resume text with ``[label]`` line prefixes.
+
+    Returns:
+        A dict keyed by ``profile_extra`` and the five module names. Each value
+        is a list of dicts in the same order as the corresponding overview
+        list; a record whose extraction failed or returned a non-dict is ``{}``.
+    """
+    specs = (
+        ("education", DETAIL_EDUCATION_PROMPT, "教育"),
+        ("work", DETAIL_WORK_PROMPT, "工作"),
+        ("internship", DETAIL_INTERNSHIP_PROMPT, "实习"),
+        ("project", DETAIL_PROJECT_PROMPT, "项目"),
+        ("competition", DETAIL_COMPETITION_PROMPT, "竞赛"),
+    )
+
+    # modules[i] records which result bucket tasks[i] belongs to.
+    tasks: list = [_extract_detail(DETAIL_PROFILE_PROMPT, "", numbered_text, "详情-个人信息补充")]
+    modules: list[str] = ["profile_extra"]
+    for module, prompt_tpl, tag in specs:
+        for name in overview[module]:
+            tasks.append(_extract_detail(prompt_tpl, name, numbered_text, f"详情-{tag}-{name}"))
+            modules.append(module)
+
+    # return_exceptions=True: one malformed record must not abort the whole resume.
+    results = await asyncio.gather(*tasks, return_exceptions=True)
+
+    details: dict[str, list] = {"profile_extra": [], **{module: [] for module, _, _ in specs}}
+    for module, result in zip(modules, results):
+        if isinstance(result, BaseException):
+            if not isinstance(result, Exception):
+                raise result  # e.g. cancellation must still propagate
+            log.warning(f"详情提取[{module}]异常: {result}")
+        details[module].append(result if isinstance(result, dict) else {})
+    return details
+
+
+# ==================== 组装 ====================
+
+def _assemble(overview: dict, details: dict, line_map: dict[str, str]) -> dict:
+    """Merge the phase-1 and phase-2 results into the final flat resume dict.
+
+    The result starts from the phase-1 profile fields, gains ``skills``,
+    ``certificates`` and ``summary`` from the profile supplement, and then one
+    list per module. In every record the ``descLines`` label list is replaced
+    by ``description``, a list of the original lines. Neither ``overview`` nor
+    ``details`` is modified.
+
+    If a record lacks its identifying field (for example because its detail
+    call failed and produced ``{}``), the name found in phase 1 is filled in so
+    the record is not emitted nameless.
+
+    Args:
+        overview: Phase-1 result (``profile`` dict plus per-module name lists).
+        details: Phase-2 result (``profile_extra`` plus per-module dict lists,
+            index-aligned with the overview lists).
+        line_map: Mapping of line label to original line text.
+
+    Returns:
+        The assembled resume dict.
+    """
+    # Identifying field of each module, as named in the detail prompts.
+    name_keys = {
+        "education": "school",
+        "work": "companyName",
+        "internship": "companyName",
+        "project": "projectName",
+        "competition": "competitionName",
+    }
+
+    extras = details.get("profile_extra") or [{}]
+    profile_extra = extras[0] if isinstance(extras[0], dict) else {}
+
+    result = dict(overview["profile"])  # copy so the caller's overview stays untouched
+    result["skills"] = profile_extra.get("skills") or []
+    result["certificates"] = profile_extra.get("certificates") or []
+    summary_texts = _resolve_desc(line_map, profile_extra.get("summaryLines"))
+    result["summary"] = "\n".join(summary_texts) if summary_texts else None
+
+    for module, name_key in name_keys.items():
+        names = overview.get(module) or []
+        items = []
+        for idx, detail in enumerate(details.get(module, [])):
+            item = {key: value for key, value in detail.items() if key != "descLines"}
+            item["description"] = _resolve_desc(line_map, detail.get("descLines"))
+            fallback = names[idx] if idx < len(names) else None
+            if not item.get(name_key) and isinstance(fallback, str) and fallback:
+                item[name_key] = fallback
+            items.append(item)
+        result[module] = items
+    return result
+
+
+# ==================== 入口 ====================
+
+async def extract_all(text: str) -> dict:
+    """Extract a structured resume from plain text in two concurrent phases.
+
+    The text is line-labelled, phase 1 finds the profile fields and the names
+    of all sub-records, phase 2 fetches the details of each record, and the
+    results are assembled into one dict.
+
+    Args:
+        text: Plain resume text.
+
+    Returns:
+        The assembled resume dict produced by ``_assemble``.
+    """
+    line_map, numbered_text = _number_lines(text)
+    log.info(f"文本编号完成，共 {len(line_map)} 行")
+
+    log.info("第一阶段：5路并行概览提取")
+    overview = await _extract_overview(numbered_text)
+    log.info(f"概览完成 - 教育:{len(overview['education'])} 工作:{len(overview['work'])} 实习:{len(overview['internship'])} 项目:{len(overview['project'])} 竞赛:{len(overview['competition'])}")
+
+    # One call per sub-record plus the profile-supplement call that phase 2 always makes.
+    total = 1 + sum(len(overview[m]) for m in ("education", "work", "internship", "project", "competition"))
+    log.info(f"第二阶段：{total}路并行详情提取")
+    details = await _extract_all_details(overview, numbered_text)
+
+    result = _assemble(overview, details, line_map)
+    log.info("两阶段提取完成，数据组装完毕")
+    return result
@@ -1,79 +1,87 @@
-"""简历各模块提取的 System Prompt
+"""简历两阶段提取 Prompt

-注意：prompt 中的 JSON 示例花括号必须用 {{ }} 转义，避免被 ChatPromptTemplate 当作变量。
+第一阶段（概览）：5路并行，只提取主表短字段和子表标识名，不提取 description。
+第二阶段（详情）：N路并行，每条子表记录单独提取全部字段，description 用字母编号引用。
+花括号用 {{ }} 转义，避免被 ChatPromptTemplate 当作变量。{name} 为运行时替换的记录标识名。
 """

-PROFILE_PROMPT = """从简历文本中仅提取个人基本信息，原文提取不要改写，输出JSON：
-```json
-{{
-  "name": "姓名",
-  "email": "邮箱",
-  "mobileNumber": "手机号",
-  "city": "所在城市",
-  "wechatNumber": "微信号",
-  "portfolioUrl": "作品集链接",
-  "skills": ["技能1"],
-  "certificates": ["证书1"],
-  "summary": "个人概述原文"
-}}
-```
-规则：只提取个人信息，不提取经历内容。summary只填"自我评价/个人概述"原文。没有的填null，数组填[]。只输出JSON。"""
+# ==================== 第一阶段：概览提取 ====================

-EDUCATION_PROMPT = """从简历文本中仅提取教育经历，原文提取不要改写，输出JSON数组：
+OVERVIEW_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造，没有的填null。
+从中仅提取个人基本信息（不含技能、证书、自我评价），输出JSON：
 ```json
-[{{
-  "school": "学校",
-  "major": "专业",
-  "degree": "学历",
-  "studyType": "全日制/非全日制",
-  "startDate": "2020.09",
-  "endDate": "2024.06",
-  "description": ["原文段落"]
-}}]
+{{ "name": "姓名", "email": "邮箱", "mobileNumber": "手机号", "city": "所在城市", "wechatNumber": "微信号", "portfolioUrl": "作品集链接" }}
 ```
-规则：只提取教育经历，不提取工作/实习/项目/竞赛。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
+规则：只提取以上6个字段，不提取skills/certificates/summary/经历。没有的填null。只输出JSON。"""

-WORK_PROMPT = """从简历文本中仅提取工作经历和实习经历，原文提取不要改写，输出JSON：
+OVERVIEW_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+从中仅提取教育经历的学校名称列表，输出JSON数组：
 ```json
-{{
-  "work": [{{
-    "companyName": "公司",
-    "position": "职位",
-    "startDate": "2024.07",
-    "endDate": "2025.03",
-    "description": ["原文段落"]
-  }}],
-  "internship": [{{
-    "companyName": "公司",
-    "position": "职位",
-    "startDate": "2023.06",
-    "endDate": "2023.09",
-    "description": ["原文段落"]
-  }}]
-}}
+["北京大学", "清华大学"]
 ```
-规则：标注"实习"的归internship，其余归work。不提取项目/教育/竞赛。时间格式YYYY.MM。没有填[]。只输出JSON。"""
+规则：只提取学校名称，不提取其他字段。没有输出[]。只输出JSON。"""

-PROJECT_PROMPT = """从简历文本中仅提取项目经历，原文提取不要改写，输出JSON数组：
+OVERVIEW_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+从中仅提取工作经历和实习经历的公司名称列表，输出JSON：
 ```json
-[{{
-  "companyName": "所属公司",
-  "projectName": "项目名",
-  "role": "角色名称（如：后端开发、项目经理、前端工程师，只填角色名不填职责描述）",
-  "startDate": "2023.03",
-  "endDate": "2023.12",
-  "description": ["原文段落"]
-}}]
+{{ "work": ["阿里巴巴", "腾讯"], "internship": ["字节跳动"] }}
 ```
-规则：只提取项目经历，不提取工作/实习/教育/竞赛。role只填简短角色名，职责内容放description。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
+规则：标注"实习"的归internship，其余归work。只提取公司名称。没有填[]。只输出JSON。"""

-COMPETITION_PROMPT = """从简历文本中仅提取竞赛/获奖经历，原文提取不要改写，输出JSON数组：
+OVERVIEW_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+从中仅提取项目经历的项目名称列表，输出JSON数组：
 ```json
-[{{
-  "competitionName": "竞赛名",
-  "award": "获奖情况",
-  "awardDate": "2023.07",
-  "description": ["原文段落"]
-}}]
+["订单系统重构", "支付网关"]
 ```
-规则：只提取竞赛获奖，不提取其他经历。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
+规则：只提取项目名称，不提取其他字段。没有输出[]。只输出JSON。"""
+
+OVERVIEW_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+从中仅提取竞赛/获奖经历的竞赛名称列表，输出JSON数组：
+```json
+["ACM区域赛", "数学建模大赛"]
+```
+规则：只提取竞赛名称，不提取其他字段。没有输出[]。只输出JSON。"""
+
+# ==================== 第二阶段：详情提取 ====================
+
+DETAIL_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+请提取"{name}"这条教育经历的详细信息，输出JSON：
+```json
+{{ "school": "学校", "major": "专业", "degree": "学历", "studyType": "全日制/非全日制", "startDate": "2020.09", "endDate": "2024.06", "descLines": "e,f,g" }}
+```
+规则：descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
+
+DETAIL_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+请提取"{name}"这条工作经历的详细信息，输出JSON：
+```json
+{{ "companyName": "公司", "position": "职位", "startDate": "2024.07", "endDate": "2025.03", "descLines": "h,i,j" }}
+```
+规则：descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
+
+DETAIL_INTERNSHIP_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+请提取"{name}"这条实习经历的详细信息，输出JSON：
+```json
+{{ "companyName": "公司", "position": "职位", "startDate": "2023.06", "endDate": "2023.09", "descLines": "p,q,r" }}
+```
+规则：descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
+
+DETAIL_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+请提取"{name}"这条项目经历的详细信息，输出JSON：
+```json
+{{ "companyName": "所属公司", "projectName": "项目名", "role": "角色名称", "startDate": "2023.03", "endDate": "2023.12", "descLines": "u,v,w" }}
+```
+规则：role只填简短角色名。descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
+
+DETAIL_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+请提取"{name}"这条竞赛/获奖经历的详细信息，输出JSON：
+```json
+{{ "competitionName": "竞赛名", "award": "获奖情况", "awardDate": "2023.07", "descLines": "ae,af" }}
+```
+规则：descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
+
+DETAIL_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取，不要猜测或编造。
+从中提取技能标签、证书和自我评价/个人概述，输出JSON：
+```json
+{{ "skills": ["技能1"], "certificates": ["证书1"], "summaryLines": "k,l,m" }}
+```
+规则：skills填技能标签数组。certificates填证书数组。summaryLines填自我评价/个人概述对应的字母编号逗号分隔。没有的填null，数组填[]。只输出JSON。"""
@@ -1,7 +1,7 @@
 """简历解析 Service

-上传简历文件 → 解析为纯文本 → AI 并行结构化 → 写入数据库。
-依赖：file_parser（文件解析工具）、resume_extractor（AI并行提取）
+上传简历文件 → 解析为纯文本 → AI 两阶段并行结构化 → 写入数据库。
+依赖：file_parser（文件解析工具）、resume_extractor（AI两阶段并行提取）
 使用表：bg_user_resume（主表）、bg_user_resume_education/work/internship/project/competition（5张子表）
 """

@@ -25,16 +25,16 @@ from app.tool.snowflake import next_id
 class ResumeParseService:

    async def parse_and_extract(self, filename: str, content: bytes) -> dict:
-        """文件解析 + AI 并行结构化，不涉及数据库操作"""
+        """文件解析 + AI 两阶段并行结构化，不涉及数据库操作"""
        log.info(f"开始解析简历文件: {filename}")
        text = await asyncio.to_thread(parse_to_text, filename, content)
        if not text or not text.strip():
            raise ValueError("文件内容为空，无法解析")
        log.info(f"文件解析完成，文本长度: {len(text)}")

-        log.info("开始AI并行结构化提取")
+        log.info("开始AI两阶段并行结构化提取")
        parsed = await extract_all(text)
-        log.info("AI并行结构化提取完成")
+        log.info("AI两阶段并行结构化提取完成")
        return parsed

    async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int:
@@ -103,4 +103,4 @@ def _to_paragraphs(texts: list[str] | None) -> list[dict] | None:
    """将字符串数组转为 [{id, text}] 格式的描述段落"""
    if not texts:
        return None
-    return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t]
+    return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t]
@@ -1,24 +1,25 @@
 """文件解析工具

 将上传的简历文件（PDF / Word / TXT）转换为纯文本字符串。
+PDF 使用 PyMuPDF (fitz) 按文本块提取，保持段落边界和阅读顺序。
 """

 import io

-import pdfplumber
+import fitz
 from docx import Document

 from app.core.logger import log


 def parse_pdf(content: bytes) -> str:
-    """解析 PDF 文件，提取全部页面文本"""
+    """解析 PDF 文件，按文本块提取，过滤图片块，保持阅读顺序"""
    text_parts: list[str] = []
-    with pdfplumber.open(io.BytesIO(content)) as pdf:
-        for page in pdf.pages:
-            page_text = page.extract_text()
-            if page_text:
-                text_parts.append(page_text)
+    with fitz.open(stream=content, filetype="pdf") as doc:
+        for page in doc:
+            for b in page.get_text("blocks", sort=True):
+                if b[6] == 0 and b[4].strip():  # type 0=文本块, 1=图片块
+                    text_parts.append(b[4].strip())
    return "\n".join(text_parts)


@@ -28,22 +29,16 @@ def parse_docx(content: bytes) -> str:
        doc = Document(io.BytesIO(content))
    except Exception:
        raise ValueError("无法解析该 Word 文件，如果是旧版 .doc 格式，请另存为 .docx 后重试")
-
    text_parts: list[str] = []
-
-    # 段落
    for para in doc.paragraphs:
        text = para.text.strip()
        if text:
            text_parts.append(text)
-
-    # 表格
    for table in doc.tables:
        for row in table.rows:
            row_text = "\t".join(cell.text.strip() for cell in row.cells)
            if row_text.strip():
                text_parts.append(row_text)
-
    return "\n".join(text_parts)


@@ -61,7 +56,6 @@ def parse_to_text(filename: str, content: bytes) -> str:
    """根据文件名后缀自动选择解析方法，返回纯文本"""
    suffix = filename[filename.rfind("."):].lower() if "." in filename else ""
    log.info(f"解析文件: {filename}，类型: {suffix}")
-
    if suffix == ".pdf":
        return parse_pdf(content)
    elif suffix in (".docx", ".doc"):
@@ -43,7 +43,7 @@ python-multipart>=0.0.9
 python-dotenv>=1.0.0

 # 文件解析
-pdfplumber>=0.11.0
+pymupdf>=1.24.0
 python-docx>=1.1.0

 # 雪花ID