优化简历提取速度

This commit is contained in:
zk
2026-04-29 15:02:05 +08:00
parent 0a15e320f5
commit b26c0a7262
5 changed files with 247 additions and 127 deletions
+160 -42
View File
@@ -1,63 +1,181 @@
"""简历并行提取:将完整简历文本拆分为5个AI任务并行提取""" """简历两阶段并行提取
第一阶段:5路并行提取主表信息 + 各子表标识名(极快,输出极短)。
第二阶段:N路并行提取每条子表记录的详情,description 用字母编号引用原文。
最终组装为与原方案完全一致的 dict 结构,上下游无感知。
"""
import asyncio import asyncio
import time
from langchain_core.output_parsers import StrOutputParser from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate from langchain_core.prompts import ChatPromptTemplate
from app.ai.models import LLM from app.ai.models import LLM
from app.ai.resume_extractor.prompts import ( from app.ai.resume_extractor.prompts import (
PROFILE_PROMPT, EDUCATION_PROMPT, WORK_PROMPT, OVERVIEW_PROFILE_PROMPT, OVERVIEW_EDUCATION_PROMPT, OVERVIEW_WORK_PROMPT,
PROJECT_PROMPT, COMPETITION_PROMPT, OVERVIEW_PROJECT_PROMPT, OVERVIEW_COMPETITION_PROMPT,
DETAIL_PROFILE_PROMPT, DETAIL_EDUCATION_PROMPT, DETAIL_WORK_PROMPT,
DETAIL_INTERNSHIP_PROMPT, DETAIL_PROJECT_PROMPT, DETAIL_COMPETITION_PROMPT,
) )
from app.core.logger import log from app.core.logger import log
from app.tool.json_helper import parse_llm_json from app.tool.json_helper import parse_llm_json
# Model used for every extraction chain; `_build_chain` instantiates it with temperature=0.
_LLM_MODEL = LLM.DOUBAO_PRO_32K
# ==================== 文本编号 ====================
def _gen_alpha(n: int):
    """Yield the first ``n`` lowercase letter labels: a..z, aa..zz, aaa, ...

    Labels follow spreadsheet-column (bijective base-26) numbering, so every
    label consists only of ``a``-``z`` and is unique no matter how large ``n``
    is. The first 702 labels are identical to the previous two-letter scheme.
    """
    base = ord("a")
    for i in range(n):
        # Bijective base-26: there is no zero digit, hence the ``- 1`` before
        # each division step.
        remaining = i + 1
        label = ""
        while remaining > 0:
            remaining, digit = divmod(remaining - 1, 26)
            label = chr(base + digit) + label
        yield label
def _number_lines(text: str) -> tuple[dict[str, str], str]:
    """Label every non-blank line of ``text`` with a letter code.

    Returns ``(line_map, numbered)`` where ``line_map`` maps each label to the
    untouched original line and ``numbered`` is the text re-joined with a
    ``[label] `` prefix on each kept line.
    """
    kept: list[str] = []
    for candidate in text.split("\n"):
        if candidate.strip():
            kept.append(candidate)

    line_map: dict[str, str] = {}
    tagged: list[str] = []
    for label, content in zip(_gen_alpha(len(kept)), kept):
        line_map[label] = content
        tagged.append(f"[{label}] {content}")
    return line_map, "\n".join(tagged)
def _resolve_desc(line_map: dict[str, str], desc_str: str | None) -> list[str]:
"""将逗号分隔的字母编号字符串解析为原文列表"""
if not desc_str or not isinstance(desc_str, str):
return []
keys = [k.strip() for k in desc_str.split(",") if k.strip()]
return [line_map[k] for k in keys if k in line_map]
# ==================== LLM 调用工具 ====================
def _build_chain(prompt: str):
    """Build an extraction chain: system prompt + human ``{text}`` -> LLM -> plain string."""
    template = ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")])
    model = _LLM_MODEL.create(temperature=0)
    return template | model | StrOutputParser()
# 5 条独立的提取链
_profile_chain = _build_chain(PROFILE_PROMPT)
_education_chain = _build_chain(EDUCATION_PROMPT)
_work_chain = _build_chain(WORK_PROMPT)
_project_chain = _build_chain(PROJECT_PROMPT)
_competition_chain = _build_chain(COMPETITION_PROMPT)
async def extract_all(text: str) -> dict:
"""asyncio.gather 并行提取简历所有模块,返回合并后的结构化数据"""
log.info("开始5路并行AI提取")
inp = {"text": text}
profile, education, work_intern, project, competition = await asyncio.gather(
_safe_invoke(_profile_chain, inp, "个人信息"),
_safe_invoke(_education_chain, inp, "教育经历"),
_safe_invoke(_work_chain, inp, "工作+实习经历"),
_safe_invoke(_project_chain, inp, "项目经历"),
_safe_invoke(_competition_chain, inp, "竞赛经历"),
)
result = profile if isinstance(profile, dict) else {}
result["education"] = education if isinstance(education, list) else []
result["work"] = work_intern.get("work", []) if isinstance(work_intern, dict) else []
result["internship"] = work_intern.get("internship", []) if isinstance(work_intern, dict) else []
result["project"] = project if isinstance(project, list) else []
result["competition"] = competition if isinstance(competition, list) else []
return result
async def _safe_invoke(chain, inp: dict, label: str):
    """Run one chain, log how long it took, and return the parsed JSON.

    Any exception raised by the call or by JSON parsing is logged as a warning
    and turned into a ``None`` result, so one failed call does not abort the
    calls running alongside it.
    """
    began = time.perf_counter()
    try:
        reply = await chain.ainvoke(inp)
        elapsed = time.perf_counter() - began
        log.info(f"AI提取[{label}]完成,耗时: {elapsed:.2f}s")
        # Parsing stays inside the try so malformed model output is also caught.
        return parse_llm_json(reply)
    except Exception as exc:
        elapsed = time.perf_counter() - began
        log.warning(f"AI提取[{label}]失败,耗时: {elapsed:.2f}s,错误: {exc}")
        return None
# ==================== 第一阶段:概览 ====================
# One chain per stage-1 (overview) prompt, built once at module import and
# invoked together by `_extract_overview`.
_overview_profile_chain = _build_chain(OVERVIEW_PROFILE_PROMPT)
_overview_education_chain = _build_chain(OVERVIEW_EDUCATION_PROMPT)
_overview_work_chain = _build_chain(OVERVIEW_WORK_PROMPT)
_overview_project_chain = _build_chain(OVERVIEW_PROJECT_PROMPT)
_overview_competition_chain = _build_chain(OVERVIEW_COMPETITION_PROMPT)
def _clean_names(value) -> list[str]:
    """Return the non-blank string entries of an LLM-produced name list (``[]`` for anything else)."""
    if not isinstance(value, list):
        return []
    return [entry.strip() for entry in value if isinstance(entry, str) and entry.strip()]


async def _extract_overview(numbered_text: str) -> dict:
    """Stage 1: run the five overview chains concurrently and normalise their output.

    Returns a dict with ``profile`` (dict) and ``education`` / ``work`` /
    ``internship`` / ``project`` / ``competition`` (lists of record names).
    Every name list is guaranteed to be a list of non-empty strings: the model
    may answer ``null`` for a key or put non-string items in an array, and the
    later stage calls ``len()`` on these lists and splices each name into a
    prompt string, both of which would otherwise raise.
    """
    inp = {"text": numbered_text}
    profile, edu_names, work_names, proj_names, comp_names = await asyncio.gather(
        _safe_invoke(_overview_profile_chain, inp, "概览-个人信息"),
        _safe_invoke(_overview_education_chain, inp, "概览-教育"),
        _safe_invoke(_overview_work_chain, inp, "概览-工作实习"),
        _safe_invoke(_overview_project_chain, inp, "概览-项目"),
        _safe_invoke(_overview_competition_chain, inp, "概览-竞赛"),
    )
    # The work chain answers with {"work": [...], "internship": [...]}.
    work_block = work_names if isinstance(work_names, dict) else {}
    return {
        "profile": profile if isinstance(profile, dict) else {},
        "education": _clean_names(edu_names),
        "work": _clean_names(work_block.get("work")),
        "internship": _clean_names(work_block.get("internship")),
        "project": _clean_names(proj_names),
        "competition": _clean_names(comp_names),
    }
# ==================== 第二阶段:详情 ====================
async def _extract_detail(prompt_tpl: str, name: str, numbered_text: str, label: str) -> dict | None:
    """Stage 2: extract the details of one record identified by ``name``.

    ``{name}`` in ``prompt_tpl`` is replaced with the record name, a chain is
    built from the resulting prompt, and the numbered full text is sent to it.
    Returns whatever ``_safe_invoke`` returns (``None`` on failure).
    """
    # The prompt is later parsed as a ChatPromptTemplate, where single braces
    # denote variables. A record name containing "{" or "}" must therefore be
    # escaped, otherwise it becomes a missing variable or a malformed template.
    safe_name = str(name).replace("{", "{{").replace("}", "}}")
    prompt = prompt_tpl.replace("{name}", safe_name)
    chain = _build_chain(prompt)
    return await _safe_invoke(chain, {"text": numbered_text}, label)
async def _extract_all_details(overview: dict, numbered_text: str) -> dict:
    """Stage 2: launch one detail extraction per overview record, plus one for the profile extras.

    All calls run concurrently. The result maps each module name (and
    ``profile_extra``) to a list of dicts in the same order as the overview
    names; a call that did not produce a dict contributes ``{}``.
    """
    sections = (
        ("education", DETAIL_EDUCATION_PROMPT, "教育"),
        ("work", DETAIL_WORK_PROMPT, "工作"),
        ("internship", DETAIL_INTERNSHIP_PROMPT, "实习"),
        ("project", DETAIL_PROJECT_PROMPT, "项目"),
        ("competition", DETAIL_COMPETITION_PROMPT, "竞赛"),
    )

    # The profile supplement (skills / certificates / summary lines) always runs first.
    pending = [_extract_detail(DETAIL_PROFILE_PROMPT, "", numbered_text, "详情-个人信息补充")]
    owners = ["profile_extra"]
    for module, prompt_tpl, tag in sections:
        for name in overview[module]:
            pending.append(_extract_detail(prompt_tpl, name, numbered_text, f"详情-{tag}-{name}"))
            owners.append(module)

    outcomes = await asyncio.gather(*pending)

    details: dict[str, list] = {"profile_extra": []}
    for module, _prompt, _tag in sections:
        details[module] = []
    # gather() keeps submission order, so pairing with `owners` puts each result in its module.
    for module, outcome in zip(owners, outcomes):
        details[module].append(outcome if isinstance(outcome, dict) else {})
    return details
# ==================== 组装 ====================
def _assemble(overview: dict, details: dict, line_map: dict[str, str]) -> dict:
    """Merge the overview and detail results into one flat resume dict.

    Profile fields sit at the top level together with ``skills``,
    ``certificates`` and ``summary``; each module key holds its list of record
    dicts, whose ``descLines`` label string is replaced by a ``description``
    list of original lines. ``overview["profile"]`` and the detail record dicts
    are modified in place.
    """
    profile = overview["profile"]

    extras = details.get("profile_extra")
    profile_extra = extras[0] if extras else {}
    profile["skills"] = profile_extra.get("skills") or []
    profile["certificates"] = profile_extra.get("certificates") or []

    summary_texts = _resolve_desc(line_map, profile_extra.get("summaryLines"))
    if summary_texts:
        profile["summary"] = "\n".join(summary_texts)
    else:
        profile["summary"] = None

    result = dict(profile)
    for module in ("education", "work", "internship", "project", "competition"):
        records = details.get(module, [])
        for record in records:
            labels = record.pop("descLines", None)
            record["description"] = _resolve_desc(line_map, labels)
        result[module] = list(records)
    return result
# ==================== 入口 ====================
async def extract_all(text: str) -> dict:
    """Entry point: label the lines, run both extraction stages, and assemble the result dict."""
    modules = ("education", "work", "internship", "project", "competition")

    line_map, numbered_text = _number_lines(text)
    log.info(f"文本编号完成,共 {len(line_map)}")

    log.info("第一阶段:5路并行概览提取")
    overview = await _extract_overview(numbered_text)
    counts = {module: len(overview[module]) for module in modules}
    log.info(f"概览完成 - 教育:{counts['education']} 工作:{counts['work']} 实习:{counts['internship']} 项目:{counts['project']} 竞赛:{counts['competition']}")

    total = sum(counts.values())
    log.info(f"第二阶段:{total}路并行详情提取")
    details = await _extract_all_details(overview, numbered_text)

    result = _assemble(overview, details, line_map)
    log.info("两阶段提取完成,数据组装完毕")
    return result
+72 -64
View File
@@ -1,79 +1,87 @@
"""简历各模块提取的 System Prompt """简历两阶段提取 Prompt
注意:prompt 中的 JSON 示例花括号必须用 {{ }} 转义,避免被 ChatPromptTemplate 当作变量 第一阶段(概览):5路并行,只提取主表短字段和子表标识名,不提取 description
第二阶段(详情):N路并行,每条子表记录单独提取全部字段,description 用字母编号引用。
花括号用 {{ }} 转义,避免被 ChatPromptTemplate 当作变量。{name} 为运行时替换的记录标识名。
""" """
PROFILE_PROMPT = """从简历文本中仅提取个人基本信息,原文提取不要改写,输出JSON: # ==================== 第一阶段:概览提取 ====================
```json
{{
"name": "姓名",
"email": "邮箱",
"mobileNumber": "手机号",
"city": "所在城市",
"wechatNumber": "微信号",
"portfolioUrl": "作品集链接",
"skills": ["技能1"],
"certificates": ["证书1"],
"summary": "个人概述原文"
}}
```
规则:只提取个人信息,不提取经历内容。summary只填"自我评价/个人概述"原文。没有的填null,数组填[]。只输出JSON。"""
EDUCATION_PROMPT = """简历文本中仅提取教育经历,原文提取不要改写,输出JSON数组: OVERVIEW_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取不要猜测或编造,没有的填null。
从中仅提取个人基本信息(不含技能、证书、自我评价),输出JSON:
```json ```json
[{{ {{ "name": "姓名", "email": "邮箱", "mobileNumber": "手机号", "city": "所在城市", "wechatNumber": "微信号", "portfolioUrl": "作品集链接" }}
"school": "学校",
"major": "专业",
"degree": "学历",
"studyType": "全日制/非全日制",
"startDate": "2020.09",
"endDate": "2024.06",
"description": ["原文段落"]
}}]
``` ```
规则:只提取教育经历,不提取工作/实习/项目/竞赛。时间格式YYYY.MM。没有输出[]。只输出JSON。""" 规则:只提取以上6个字段,不提取skills/certificates/summary/经历。没有的填null。只输出JSON。"""
WORK_PROMPT = """简历文本中仅提取工作经历和实习经历,原文提取不要改写,输出JSON OVERVIEW_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取不要猜测或编造。
从中仅提取教育经历的学校名称列表,输出JSON数组:
```json ```json
{{ ["北京大学", "清华大学"]
"work": [{{
"companyName": "公司",
"position": "职位",
"startDate": "2024.07",
"endDate": "2025.03",
"description": ["原文段落"]
}}],
"internship": [{{
"companyName": "公司",
"position": "职位",
"startDate": "2023.06",
"endDate": "2023.09",
"description": ["原文段落"]
}}]
}}
``` ```
规则:标注"实习"的归internship,其余归work。不提取项目/教育/竞赛。时间格式YYYY.MM。没有[]。只输出JSON。""" 规则:只提取学校名称,不提取其他字段。没有输出[]。只输出JSON。"""
PROJECT_PROMPT = """简历文本中仅提取项目经历,原文提取不要改写,输出JSON数组: OVERVIEW_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取不要猜测或编造。
从中仅提取工作经历和实习经历的公司名称列表,输出JSON:
```json ```json
[{{ {{ "work": ["阿里巴巴", "腾讯"], "internship": ["字节跳动"] }}
"companyName": "所属公司",
"projectName": "项目名",
"role": "角色名称(如:后端开发、项目经理、前端工程师,只填角色名不填职责描述)",
"startDate": "2023.03",
"endDate": "2023.12",
"description": ["原文段落"]
}}]
``` ```
规则:只提取项目经历,不提取工作/实习/教育/竞赛。role只填简短角色名,职责内容放description。时间格式YYYY.MM。没有输出[]。只输出JSON。""" 规则:标注"实习"的归internship,其余归work。只提取公司名称。没有[]。只输出JSON。"""
COMPETITION_PROMPT = """简历文本中仅提取竞赛/获奖经历,原文提取不要改写,输出JSON数组: OVERVIEW_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取不要猜测或编造。
从中仅提取项目经历的项目名称列表,输出JSON数组:
```json ```json
[{{ ["订单系统重构", "支付网关"]
"competitionName": "竞赛名",
"award": "获奖情况",
"awardDate": "2023.07",
"description": ["原文段落"]
}}]
``` ```
规则:只提取竞赛获奖,不提取其他经历。时间格式YYYY.MM。没有输出[]。只输出JSON。""" 规则:只提取项目名称,不提取其他字段。没有输出[]。只输出JSON。"""
# Stage-1 (overview) prompt: asks only for a JSON array of competition names.
OVERVIEW_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
从中仅提取竞赛/获奖经历的竞赛名称列表,输出JSON数组:
```json
["ACM区域赛", "数学建模大赛"]
```
规则:只提取竞赛名称,不提取其他字段。没有输出[]。只输出JSON。"""
# ==================== 第二阶段:详情提取 ====================
# Stage-2 (detail) prompt for one education record. `{name}` is substituted at
# runtime with the record's identifying name; `{{ }}` are literal braces escaped
# for ChatPromptTemplate. `descLines` is answered as comma-separated line labels.
DETAIL_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
请提取"{name}"这条教育经历的详细信息,输出JSON
```json
{{ "school": "学校", "major": "专业", "degree": "学历", "studyType": "全日制/非全日制", "startDate": "2020.09", "endDate": "2024.06", "descLines": "e,f,g" }}
```
规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
# Stage-2 (detail) prompt for one work record. `{name}` is substituted at runtime
# with the record's identifying name; `{{ }}` are literal braces escaped for
# ChatPromptTemplate. `descLines` is answered as comma-separated line labels.
DETAIL_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
请提取"{name}"这条工作经历的详细信息,输出JSON
```json
{{ "companyName": "公司", "position": "职位", "startDate": "2024.07", "endDate": "2025.03", "descLines": "h,i,j" }}
```
规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
# Stage-2 (detail) prompt for one internship record. `{name}` is substituted at
# runtime with the record's identifying name; `{{ }}` are literal braces escaped
# for ChatPromptTemplate. `descLines` is answered as comma-separated line labels.
DETAIL_INTERNSHIP_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
请提取"{name}"这条实习经历的详细信息,输出JSON
```json
{{ "companyName": "公司", "position": "职位", "startDate": "2023.06", "endDate": "2023.09", "descLines": "p,q,r" }}
```
规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
# Stage-2 (detail) prompt for one project record. `{name}` is substituted at
# runtime with the record's identifying name; `{{ }}` are literal braces escaped
# for ChatPromptTemplate. `role` is limited to a short role name and `descLines`
# is answered as comma-separated line labels.
DETAIL_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
请提取"{name}"这条项目经历的详细信息,输出JSON
```json
{{ "companyName": "所属公司", "projectName": "项目名", "role": "角色名称", "startDate": "2023.03", "endDate": "2023.12", "descLines": "u,v,w" }}
```
规则:role只填简短角色名。descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
# Stage-2 (detail) prompt for one competition/award record. `{name}` is
# substituted at runtime with the record's identifying name; `{{ }}` are literal
# braces escaped for ChatPromptTemplate. `descLines` is answered as
# comma-separated line labels.
DETAIL_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
请提取"{name}"这条竞赛/获奖经历的详细信息,输出JSON:
```json
{{ "competitionName": "竞赛名", "award": "获奖情况", "awardDate": "2023.07", "descLines": "ae,af" }}
```
规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
# Stage-2 prompt for the profile supplement: skills, certificates and the
# self-summary. It has no `{name}` placeholder; `summaryLines` is answered as
# comma-separated line labels and `{{ }}` are literal braces escaped for
# ChatPromptTemplate.
DETAIL_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
从中提取技能标签、证书和自我评价/个人概述,输出JSON:
```json
{{ "skills": ["技能1"], "certificates": ["证书1"], "summaryLines": "k,l,m" }}
```
规则:skills填技能标签数组。certificates填证书数组。summaryLines填自我评价/个人概述对应的字母编号逗号分隔。没有的填null,数组填[]。只输出JSON。"""
+5 -5
View File
@@ -1,7 +1,7 @@
"""简历解析 Service """简历解析 Service
上传简历文件 → 解析为纯文本 → AI 并行结构化 → 写入数据库。 上传简历文件 → 解析为纯文本 → AI 两阶段并行结构化 → 写入数据库。
依赖:file_parser(文件解析工具)、resume_extractorAI并行提取) 依赖:file_parser(文件解析工具)、resume_extractorAI两阶段并行提取)
使用表:bg_user_resume(主表)、bg_user_resume_education/work/internship/project/competition5张子表) 使用表:bg_user_resume(主表)、bg_user_resume_education/work/internship/project/competition5张子表)
""" """
@@ -25,16 +25,16 @@ from app.tool.snowflake import next_id
class ResumeParseService: class ResumeParseService:
async def parse_and_extract(self, filename: str, content: bytes) -> dict:
    """Parse the uploaded file to text and run the two-stage AI extraction; no database access.

    Raises:
        ValueError: if the parsed text is empty or whitespace only.
    """
    log.info(f"开始解析简历文件: {filename}")
    # File parsing is blocking work, so it runs in a worker thread.
    text = await asyncio.to_thread(parse_to_text, filename, content)
    has_content = bool(text) and bool(text.strip())
    if not has_content:
        raise ValueError("文件内容为空,无法解析")
    log.info(f"文件解析完成,文本长度: {len(text)}")

    log.info("开始AI两阶段并行结构化提取")
    parsed = await extract_all(text)
    log.info("AI两阶段并行结构化提取完成")
    return parsed
async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int: async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int:
+8 -14
View File
@@ -1,24 +1,25 @@
"""文件解析工具 """文件解析工具
将上传的简历文件(PDF / Word / TXT)转换为纯文本字符串。 将上传的简历文件(PDF / Word / TXT)转换为纯文本字符串。
PDF 使用 PyMuPDF (fitz) 按文本块提取,保持段落边界和阅读顺序。
""" """
import io import io
import pdfplumber import fitz
from docx import Document from docx import Document
from app.core.logger import log from app.core.logger import log
def parse_pdf(content: bytes) -> str:
    """Extract PDF text block by block in sorted reading order, skipping image blocks."""
    with fitz.open(stream=content, filetype="pdf") as doc:
        chunks: list[str] = [
            block[4].strip()
            for page in doc
            for block in page.get_text("blocks", sort=True)
            # block[6] is the block type (0 = text, 1 = image); block[4] is the block's text.
            if block[6] == 0 and block[4].strip()
        ]
    return "\n".join(chunks)
@@ -28,22 +29,16 @@ def parse_docx(content: bytes) -> str:
doc = Document(io.BytesIO(content)) doc = Document(io.BytesIO(content))
except Exception: except Exception:
raise ValueError("无法解析该 Word 文件,如果是旧版 .doc 格式,请另存为 .docx 后重试") raise ValueError("无法解析该 Word 文件,如果是旧版 .doc 格式,请另存为 .docx 后重试")
text_parts: list[str] = [] text_parts: list[str] = []
# 段落
for para in doc.paragraphs: for para in doc.paragraphs:
text = para.text.strip() text = para.text.strip()
if text: if text:
text_parts.append(text) text_parts.append(text)
# 表格
for table in doc.tables: for table in doc.tables:
for row in table.rows: for row in table.rows:
row_text = "\t".join(cell.text.strip() for cell in row.cells) row_text = "\t".join(cell.text.strip() for cell in row.cells)
if row_text.strip(): if row_text.strip():
text_parts.append(row_text) text_parts.append(row_text)
return "\n".join(text_parts) return "\n".join(text_parts)
@@ -61,7 +56,6 @@ def parse_to_text(filename: str, content: bytes) -> str:
"""根据文件名后缀自动选择解析方法,返回纯文本""" """根据文件名后缀自动选择解析方法,返回纯文本"""
suffix = filename[filename.rfind("."):].lower() if "." in filename else "" suffix = filename[filename.rfind("."):].lower() if "." in filename else ""
log.info(f"解析文件: {filename},类型: {suffix}") log.info(f"解析文件: {filename},类型: {suffix}")
if suffix == ".pdf": if suffix == ".pdf":
return parse_pdf(content) return parse_pdf(content)
elif suffix in (".docx", ".doc"): elif suffix in (".docx", ".doc"):
+1 -1
View File
@@ -43,7 +43,7 @@ python-multipart>=0.0.9
python-dotenv>=1.0.0 python-dotenv>=1.0.0
# 文件解析 # 文件解析
pdfplumber>=0.11.0 pymupdf>=1.24.0
python-docx>=1.1.0 python-docx>=1.1.0
# 雪花ID # 雪花ID