diff --git a/app/ai/resume_extractor/extractor.py b/app/ai/resume_extractor/extractor.py index 7cadfa2..ee6a148 100644 --- a/app/ai/resume_extractor/extractor.py +++ b/app/ai/resume_extractor/extractor.py @@ -1,63 +1,181 @@ -"""简历并行提取:将完整简历文本拆分为5个AI任务并行提取""" +"""简历两阶段并行提取 + +第一阶段:5路并行提取主表信息 + 各子表标识名(极快,输出极短)。 +第二阶段:N路并行提取每条子表记录的详情,description 用字母编号引用原文。 +最终组装为与原方案完全一致的 dict 结构,上下游无感知。 +""" import asyncio +import time from langchain_core.output_parsers import StrOutputParser from langchain_core.prompts import ChatPromptTemplate from app.ai.models import LLM from app.ai.resume_extractor.prompts import ( - PROFILE_PROMPT, EDUCATION_PROMPT, WORK_PROMPT, - PROJECT_PROMPT, COMPETITION_PROMPT, + OVERVIEW_PROFILE_PROMPT, OVERVIEW_EDUCATION_PROMPT, OVERVIEW_WORK_PROMPT, + OVERVIEW_PROJECT_PROMPT, OVERVIEW_COMPETITION_PROMPT, + DETAIL_PROFILE_PROMPT, DETAIL_EDUCATION_PROMPT, DETAIL_WORK_PROMPT, + DETAIL_INTERNSHIP_PROMPT, DETAIL_PROJECT_PROMPT, DETAIL_COMPETITION_PROMPT, ) from app.core.logger import log from app.tool.json_helper import parse_llm_json +_LLM_MODEL = LLM.DOUBAO_PRO_32K + + +# ==================== 文本编号 ==================== + +def _gen_alpha(n: int): + """生成 n 个字母编号:a,b,...,z,aa,ab,...,az,ba,...""" + for i in range(n): + yield chr(ord('a') + i) if i < 26 else chr(ord('a') + (i // 26 - 1)) + chr(ord('a') + i % 26) + + +def _number_lines(text: str) -> tuple[dict[str, str], str]: + """按换行分割、过滤空行、字母编号,返回 (字母→原文dict, 带编号文本)""" + raw_lines = [line for line in text.split("\n") if line.strip()] + alphas = list(_gen_alpha(len(raw_lines))) + line_map = dict(zip(alphas, raw_lines)) + numbered = "\n".join(f"[{a}] {line}" for a, line in zip(alphas, raw_lines)) + return line_map, numbered + + +def _resolve_desc(line_map: dict[str, str], desc_str: str | None) -> list[str]: + """将逗号分隔的字母编号字符串解析为原文列表""" + if not desc_str or not isinstance(desc_str, str): + return [] + keys = [k.strip() for k in desc_str.split(",") if k.strip()] + return [line_map[k] for k in keys if k in line_map] + + +# ==================== LLM 调用工具 ==================== def _build_chain(prompt: str): - """构建单个提取链:prompt → LLM → 文本输出""" - return ( - ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")]) - | LLM.JIAYU_CLAUDE_SONNET_4_5.create(temperature=0) - | StrOutputParser() - ) - - -# 5 条独立的提取链 -_profile_chain = _build_chain(PROFILE_PROMPT) -_education_chain = _build_chain(EDUCATION_PROMPT) -_work_chain = _build_chain(WORK_PROMPT) -_project_chain = _build_chain(PROJECT_PROMPT) -_competition_chain = _build_chain(COMPETITION_PROMPT) - - -async def extract_all(text: str) -> dict: - """asyncio.gather 并行提取简历所有模块,返回合并后的结构化数据""" - log.info("开始5路并行AI提取") - inp = {"text": text} - - profile, education, work_intern, project, competition = await asyncio.gather( - _safe_invoke(_profile_chain, inp, "个人信息"), - _safe_invoke(_education_chain, inp, "教育经历"), - _safe_invoke(_work_chain, inp, "工作+实习经历"), - _safe_invoke(_project_chain, inp, "项目经历"), - _safe_invoke(_competition_chain, inp, "竞赛经历"), - ) - - result = profile if isinstance(profile, dict) else {} - result["education"] = education if isinstance(education, list) else [] - result["work"] = work_intern.get("work", []) if isinstance(work_intern, dict) else [] - result["internship"] = work_intern.get("internship", []) if isinstance(work_intern, dict) else [] - result["project"] = project if isinstance(project, list) else [] - result["competition"] = competition if isinstance(competition, list) else [] - return result + """构建提取链:prompt → LLM → 文本输出""" + return ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")]) | _LLM_MODEL.create(temperature=0) | StrOutputParser() async def _safe_invoke(chain, inp: dict, label: str): - """单个链调用,失败返回空""" + """单个链调用,记录耗时,失败返回空""" + start = time.perf_counter() try: raw = await chain.ainvoke(inp) + log.info(f"AI提取[{label}]完成,耗时: {time.perf_counter() - start:.2f}s") return parse_llm_json(raw) except Exception as e: - log.warning(f"AI提取[{label}]失败: {e}") - return {} if "个人信息" in label else [] + log.warning(f"AI提取[{label}]失败,耗时: {time.perf_counter() - start:.2f}s,错误: {e}") + return None + + +# ==================== 第一阶段:概览 ==================== + +_overview_profile_chain = _build_chain(OVERVIEW_PROFILE_PROMPT) +_overview_education_chain = _build_chain(OVERVIEW_EDUCATION_PROMPT) +_overview_work_chain = _build_chain(OVERVIEW_WORK_PROMPT) +_overview_project_chain = _build_chain(OVERVIEW_PROJECT_PROMPT) +_overview_competition_chain = _build_chain(OVERVIEW_COMPETITION_PROMPT) + + +async def _extract_overview(numbered_text: str) -> dict: + """第一阶段:5路并行提取概览信息""" + inp = {"text": numbered_text} + profile, edu_names, work_names, proj_names, comp_names = await asyncio.gather( + _safe_invoke(_overview_profile_chain, inp, "概览-个人信息"), + _safe_invoke(_overview_education_chain, inp, "概览-教育"), + _safe_invoke(_overview_work_chain, inp, "概览-工作实习"), + _safe_invoke(_overview_project_chain, inp, "概览-项目"), + _safe_invoke(_overview_competition_chain, inp, "概览-竞赛"), + ) + return { + "profile": profile if isinstance(profile, dict) else {}, + "education": edu_names if isinstance(edu_names, list) else [], + "work": work_names.get("work", []) if isinstance(work_names, dict) else [], + "internship": work_names.get("internship", []) if isinstance(work_names, dict) else [], + "project": proj_names if isinstance(proj_names, list) else [], + "competition": comp_names if isinstance(comp_names, list) else [], + } + + +# ==================== 第二阶段:详情 ==================== + +async def _extract_detail(prompt_tpl: str, name: str, numbered_text: str, label: str) -> dict | None: + """单条子表记录详情提取:用 name 替换 prompt 中的 {name},发送带编号全文""" + prompt = prompt_tpl.replace("{name}", name) + chain = _build_chain(prompt) + return await _safe_invoke(chain, {"text": numbered_text}, label) + + +async def _extract_all_details(overview: dict, numbered_text: str) -> dict: + """第二阶段:根据概览结果,N路并行提取所有子表记录详情 + 个人信息的skills/certificates/summary""" + tasks: list = [] + task_meta: list[tuple[str, int]] = [] # (模块名, 索引) 用于结果归位 + + # profile 的 skills/certificates/summaryLines + tasks.append(_extract_detail(DETAIL_PROFILE_PROMPT, "", numbered_text, "详情-个人信息补充")) + task_meta.append(("profile_extra", 0)) + + for i, name in enumerate(overview["education"]): + tasks.append(_extract_detail(DETAIL_EDUCATION_PROMPT, name, numbered_text, f"详情-教育-{name}")) + task_meta.append(("education", i)) + for i, name in enumerate(overview["work"]): + tasks.append(_extract_detail(DETAIL_WORK_PROMPT, name, numbered_text, f"详情-工作-{name}")) + task_meta.append(("work", i)) + for i, name in enumerate(overview["internship"]): + tasks.append(_extract_detail(DETAIL_INTERNSHIP_PROMPT, name, numbered_text, f"详情-实习-{name}")) + task_meta.append(("internship", i)) + for i, name in enumerate(overview["project"]): + tasks.append(_extract_detail(DETAIL_PROJECT_PROMPT, name, numbered_text, f"详情-项目-{name}")) + task_meta.append(("project", i)) + for i, name in enumerate(overview["competition"]): + tasks.append(_extract_detail(DETAIL_COMPETITION_PROMPT, name, numbered_text, f"详情-竞赛-{name}")) + task_meta.append(("competition", i)) + + results = await asyncio.gather(*tasks) + details: dict[str, list] = {"profile_extra": [], "education": [], "work": [], "internship": [], "project": [], "competition": []} + for (module, _idx), result in zip(task_meta, results): + details[module].append(result if isinstance(result, dict) else {}) + return details + + +# ==================== 组装 ==================== + +def _assemble(overview: dict, details: dict, line_map: dict[str, str]) -> dict: + """将两阶段结果组装为与原方案一致的 dict 结构""" + profile = overview["profile"] + # 合并第二阶段提取的 skills/certificates/summaryLines + profile_extra = details.get("profile_extra", [{}])[0] if details.get("profile_extra") else {} + profile["skills"] = profile_extra.get("skills") or [] + profile["certificates"] = profile_extra.get("certificates") or [] + summary_str = profile_extra.get("summaryLines") + summary_texts = _resolve_desc(line_map, summary_str) + profile["summary"] = "\n".join(summary_texts) if summary_texts else None + result = dict(profile) + for module in ("education", "work", "internship", "project", "competition"): + items = [] + for item in details.get(module, []): + desc_str = item.pop("descLines", None) + item["description"] = _resolve_desc(line_map, desc_str) + items.append(item) + result[module] = items + return result + + +# ==================== 入口 ==================== + +async def extract_all(text: str) -> dict: + """两阶段并行提取简历,返回与原方案完全一致的结构化数据""" + line_map, numbered_text = _number_lines(text) + log.info(f"文本编号完成,共 {len(line_map)} 行") + + log.info("第一阶段:5路并行概览提取") + overview = await _extract_overview(numbered_text) + log.info(f"概览完成 - 教育:{len(overview['education'])} 工作:{len(overview['work'])} 实习:{len(overview['internship'])} 项目:{len(overview['project'])} 竞赛:{len(overview['competition'])}") + + total = sum(len(overview[m]) for m in ("education", "work", "internship", "project", "competition")) + log.info(f"第二阶段:{total}路并行详情提取") + details = await _extract_all_details(overview, numbered_text) + + result = _assemble(overview, details, line_map) + log.info("两阶段提取完成,数据组装完毕") + return result diff --git a/app/ai/resume_extractor/prompts.py b/app/ai/resume_extractor/prompts.py index 81b6061..d6c6e4f 100644 --- a/app/ai/resume_extractor/prompts.py +++ b/app/ai/resume_extractor/prompts.py @@ -1,79 +1,87 @@ -"""简历各模块提取的 System Prompt +"""简历两阶段提取 Prompt -注意:prompt 中的 JSON 示例花括号必须用 {{ }} 转义,避免被 ChatPromptTemplate 当作变量。 +第一阶段(概览):5路并行,只提取主表短字段和子表标识名,不提取 description。 +第二阶段(详情):N路并行,每条子表记录单独提取全部字段,description 用字母编号引用。 +花括号用 {{ }} 转义,避免被 ChatPromptTemplate 当作变量。{name} 为运行时替换的记录标识名。 """ -PROFILE_PROMPT = """从简历文本中仅提取个人基本信息,原文提取不要改写,输出JSON: -```json -{{ - "name": "姓名", - "email": "邮箱", - "mobileNumber": "手机号", - "city": "所在城市", - "wechatNumber": "微信号", - "portfolioUrl": "作品集链接", - "skills": ["技能1"], - "certificates": ["证书1"], - "summary": "个人概述原文" -}} -``` -规则:只提取个人信息,不提取经历内容。summary只填"自我评价/个人概述"原文。没有的填null,数组填[]。只输出JSON。""" +# ==================== 第一阶段:概览提取 ==================== -EDUCATION_PROMPT = """从简历文本中仅提取教育经历,原文提取不要改写,输出JSON数组: +OVERVIEW_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造,没有的填null。 +从中仅提取个人基本信息(不含技能、证书、自我评价),输出JSON: ```json -[{{ - "school": "学校", - "major": "专业", - "degree": "学历", - "studyType": "全日制/非全日制", - "startDate": "2020.09", - "endDate": "2024.06", - "description": ["原文段落"] -}}] +{{ "name": "姓名", "email": "邮箱", "mobileNumber": "手机号", "city": "所在城市", "wechatNumber": "微信号", "portfolioUrl": "作品集链接" }} ``` -规则:只提取教育经历,不提取工作/实习/项目/竞赛。时间格式YYYY.MM。没有输出[]。只输出JSON。""" +规则:只提取以上6个字段,不提取skills/certificates/summary/经历。没有的填null。只输出JSON。""" -WORK_PROMPT = """从简历文本中仅提取工作经历和实习经历,原文提取不要改写,输出JSON: +OVERVIEW_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +从中仅提取教育经历的学校名称列表,输出JSON数组: ```json -{{ - "work": [{{ - "companyName": "公司", - "position": "职位", - "startDate": "2024.07", - "endDate": "2025.03", - "description": ["原文段落"] - }}], - "internship": [{{ - "companyName": "公司", - "position": "职位", - "startDate": "2023.06", - "endDate": "2023.09", - "description": ["原文段落"] - }}] -}} +["北京大学", "清华大学"] ``` -规则:标注"实习"的归internship,其余归work。不提取项目/教育/竞赛。时间格式YYYY.MM。没有填[]。只输出JSON。""" +规则:只提取学校名称,不提取其他字段。没有输出[]。只输出JSON。""" -PROJECT_PROMPT = """从简历文本中仅提取项目经历,原文提取不要改写,输出JSON数组: +OVERVIEW_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +从中仅提取工作经历和实习经历的公司名称列表,输出JSON: ```json -[{{ - "companyName": "所属公司", - "projectName": "项目名", - "role": "角色名称(如:后端开发、项目经理、前端工程师,只填角色名不填职责描述)", - "startDate": "2023.03", - "endDate": "2023.12", - "description": ["原文段落"] -}}] +{{ "work": ["阿里巴巴", "腾讯"], "internship": ["字节跳动"] }} ``` -规则:只提取项目经历,不提取工作/实习/教育/竞赛。role只填简短角色名,职责内容放description。时间格式YYYY.MM。没有输出[]。只输出JSON。""" +规则:标注"实习"的归internship,其余归work。只提取公司名称。没有填[]。只输出JSON。""" -COMPETITION_PROMPT = """从简历文本中仅提取竞赛/获奖经历,原文提取不要改写,输出JSON数组: +OVERVIEW_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +从中仅提取项目经历的项目名称列表,输出JSON数组: ```json -[{{ - "competitionName": "竞赛名", - "award": "获奖情况", - "awardDate": "2023.07", - "description": ["原文段落"] -}}] +["订单系统重构", "支付网关"] ``` -规则:只提取竞赛获奖,不提取其他经历。时间格式YYYY.MM。没有输出[]。只输出JSON。""" \ No newline at end of file +规则:只提取项目名称,不提取其他字段。没有输出[]。只输出JSON。""" + +OVERVIEW_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +从中仅提取竞赛/获奖经历的竞赛名称列表,输出JSON数组: +```json +["ACM区域赛", "数学建模大赛"] +``` +规则:只提取竞赛名称,不提取其他字段。没有输出[]。只输出JSON。""" + +# ==================== 第二阶段:详情提取 ==================== + +DETAIL_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +请提取"{name}"这条教育经历的详细信息,输出JSON: +```json +{{ "school": "学校", "major": "专业", "degree": "学历", "studyType": "全日制/非全日制", "startDate": "2020.09", "endDate": "2024.06", "descLines": "e,f,g" }} +``` +规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。""" + +DETAIL_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +请提取"{name}"这条工作经历的详细信息,输出JSON: +```json +{{ "companyName": "公司", "position": "职位", "startDate": "2024.07", "endDate": "2025.03", "descLines": "h,i,j" }} +``` +规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。""" + +DETAIL_INTERNSHIP_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +请提取"{name}"这条实习经历的详细信息,输出JSON: +```json +{{ "companyName": "公司", "position": "职位", "startDate": "2023.06", "endDate": "2023.09", "descLines": "p,q,r" }} +``` +规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。""" + +DETAIL_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +请提取"{name}"这条项目经历的详细信息,输出JSON: +```json +{{ "companyName": "所属公司", "projectName": "项目名", "role": "角色名称", "startDate": "2023.03", "endDate": "2023.12", "descLines": "u,v,w" }} +``` +规则:role只填简短角色名。descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。""" + +DETAIL_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +请提取"{name}"这条竞赛/获奖经历的详细信息,输出JSON: +```json +{{ "competitionName": "竞赛名", "award": "获奖情况", "awardDate": "2023.07", "descLines": "ae,af" }} +``` +规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。""" + +DETAIL_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。 +从中提取技能标签、证书和自我评价/个人概述,输出JSON: +```json +{{ "skills": ["技能1"], "certificates": ["证书1"], "summaryLines": "k,l,m" }} +``` +规则:skills填技能标签数组。certificates填证书数组。summaryLines填自我评价/个人概述对应的字母编号逗号分隔。没有的填null,数组填[]。只输出JSON。""" diff --git a/app/services/resume_parse_service.py b/app/services/resume_parse_service.py index bd25cd1..db14f22 100644 --- a/app/services/resume_parse_service.py +++ b/app/services/resume_parse_service.py @@ -1,7 +1,7 @@ """简历解析 Service -上传简历文件 → 解析为纯文本 → AI 并行结构化 → 写入数据库。 -依赖:file_parser(文件解析工具)、resume_extractor(AI并行提取) +上传简历文件 → 解析为纯文本 → AI 两阶段并行结构化 → 写入数据库。 +依赖:file_parser(文件解析工具)、resume_extractor(AI两阶段并行提取) 使用表:bg_user_resume(主表)、bg_user_resume_education/work/internship/project/competition(5张子表) """ @@ -25,16 +25,16 @@ from app.tool.snowflake import next_id class ResumeParseService: async def parse_and_extract(self, filename: str, content: bytes) -> dict: - """文件解析 + AI 并行结构化,不涉及数据库操作""" + """文件解析 + AI 两阶段并行结构化,不涉及数据库操作""" log.info(f"开始解析简历文件: {filename}") text = await asyncio.to_thread(parse_to_text, filename, content) if not text or not text.strip(): raise ValueError("文件内容为空,无法解析") log.info(f"文件解析完成,文本长度: {len(text)}") - log.info("开始AI并行结构化提取") + log.info("开始AI两阶段并行结构化提取") parsed = await extract_all(text) - log.info("AI并行结构化提取完成") + log.info("AI两阶段并行结构化提取完成") return parsed async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int: @@ -103,4 +103,4 @@ def _to_paragraphs(texts: list[str] | None) -> list[dict] | None: """将字符串数组转为 [{id, text}] 格式的描述段落""" if not texts: return None - return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t] \ No newline at end of file + return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t] diff --git a/app/tool/file_parser.py b/app/tool/file_parser.py index 2920d35..6e8a080 100644 --- a/app/tool/file_parser.py +++ b/app/tool/file_parser.py @@ -1,24 +1,25 @@ """文件解析工具 将上传的简历文件(PDF / Word / TXT)转换为纯文本字符串。 +PDF 使用 PyMuPDF (fitz) 按文本块提取,保持段落边界和阅读顺序。 """ import io -import pdfplumber +import fitz from docx import Document from app.core.logger import log def parse_pdf(content: bytes) -> str: - """解析 PDF 文件,提取全部页面文本""" + """解析 PDF 文件,按文本块提取,过滤图片块,保持阅读顺序""" text_parts: list[str] = [] - with pdfplumber.open(io.BytesIO(content)) as pdf: - for page in pdf.pages: - page_text = page.extract_text() - if page_text: - text_parts.append(page_text) + with fitz.open(stream=content, filetype="pdf") as doc: + for page in doc: + for b in page.get_text("blocks", sort=True): + if b[6] == 0 and b[4].strip(): # type 0=文本块, 1=图片块 + text_parts.append(b[4].strip()) return "\n".join(text_parts) @@ -28,22 +29,16 @@ def parse_docx(content: bytes) -> str: doc = Document(io.BytesIO(content)) except Exception: raise ValueError("无法解析该 Word 文件,如果是旧版 .doc 格式,请另存为 .docx 后重试") - text_parts: list[str] = [] - - # 段落 for para in doc.paragraphs: text = para.text.strip() if text: text_parts.append(text) - - # 表格 for table in doc.tables: for row in table.rows: row_text = "\t".join(cell.text.strip() for cell in row.cells) if row_text.strip(): text_parts.append(row_text) - return "\n".join(text_parts) @@ -61,7 +56,6 @@ def parse_to_text(filename: str, content: bytes) -> str: """根据文件名后缀自动选择解析方法,返回纯文本""" suffix = filename[filename.rfind("."):].lower() if "." in filename else "" log.info(f"解析文件: {filename},类型: {suffix}") - if suffix == ".pdf": return parse_pdf(content) elif suffix in (".docx", ".doc"): diff --git a/requirements.txt b/requirements.txt index 129f8b9..d13cb58 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,7 +43,7 @@ python-multipart>=0.0.9 python-dotenv>=1.0.0 # 文件解析 -pdfplumber>=0.11.0 +pymupdf>=1.24.0 python-docx>=1.1.0 # 雪花ID