优化简历提取速度
This commit is contained in:
@@ -1,63 +1,181 @@
|
||||
"""简历并行提取:将完整简历文本拆分为5个AI任务并行提取"""
|
||||
"""简历两阶段并行提取
|
||||
|
||||
第一阶段:5路并行提取主表信息 + 各子表标识名(极快,输出极短)。
|
||||
第二阶段:N路并行提取每条子表记录的详情,description 用字母编号引用原文。
|
||||
最终组装为与原方案完全一致的 dict 结构,上下游无感知。
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
|
||||
from langchain_core.output_parsers import StrOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
|
||||
from app.ai.models import LLM
|
||||
from app.ai.resume_extractor.prompts import (
|
||||
PROFILE_PROMPT, EDUCATION_PROMPT, WORK_PROMPT,
|
||||
PROJECT_PROMPT, COMPETITION_PROMPT,
|
||||
OVERVIEW_PROFILE_PROMPT, OVERVIEW_EDUCATION_PROMPT, OVERVIEW_WORK_PROMPT,
|
||||
OVERVIEW_PROJECT_PROMPT, OVERVIEW_COMPETITION_PROMPT,
|
||||
DETAIL_PROFILE_PROMPT, DETAIL_EDUCATION_PROMPT, DETAIL_WORK_PROMPT,
|
||||
DETAIL_INTERNSHIP_PROMPT, DETAIL_PROJECT_PROMPT, DETAIL_COMPETITION_PROMPT,
|
||||
)
|
||||
from app.core.logger import log
|
||||
from app.tool.json_helper import parse_llm_json
|
||||
|
||||
_LLM_MODEL = LLM.DOUBAO_PRO_32K
|
||||
|
||||
|
||||
# ==================== 文本编号 ====================
|
||||
|
||||
def _gen_alpha(n: int):
|
||||
"""生成 n 个字母编号:a,b,...,z,aa,ab,...,az,ba,..."""
|
||||
for i in range(n):
|
||||
yield chr(ord('a') + i) if i < 26 else chr(ord('a') + (i // 26 - 1)) + chr(ord('a') + i % 26)
|
||||
|
||||
|
||||
def _number_lines(text: str) -> tuple[dict[str, str], str]:
    """Split text on newlines, drop blank lines and tag every kept line with a letter label.

    Returns:
        A pair ``(label -> original line, numbered text)``. The numbered text
        holds one ``[label] line`` entry per kept line, joined by newlines.
    """
    kept = [ln for ln in text.split("\n") if ln.strip()]
    line_map: dict[str, str] = {}
    rendered: list[str] = []
    for label, ln in zip(_gen_alpha(len(kept)), kept):
        line_map[label] = ln
        rendered.append(f"[{label}] {ln}")
    return line_map, "\n".join(rendered)
|
||||
|
||||
|
||||
def _resolve_desc(line_map: dict[str, str], desc_str: str | None) -> list[str]:
|
||||
"""将逗号分隔的字母编号字符串解析为原文列表"""
|
||||
if not desc_str or not isinstance(desc_str, str):
|
||||
return []
|
||||
keys = [k.strip() for k in desc_str.split(",") if k.strip()]
|
||||
return [line_map[k] for k in keys if k in line_map]
|
||||
|
||||
|
||||
# ==================== LLM 调用工具 ====================
|
||||
|
||||
def _build_chain(prompt: str):
    """Build one extraction chain: prompt -> LLM -> plain-text output.

    Args:
        prompt: System prompt text. It is parsed as a ChatPromptTemplate
            message, so literal braces in it must be escaped as ``{{`` / ``}}``;
            the resume text is supplied through the ``{text}`` human message.

    Returns:
        The composed pipeline (prompt template | model | StrOutputParser).
    """
    template = ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")])
    # Use the module-level model constant so all chains switch models in one place
    # instead of hard-coding a specific model here.
    return template | _LLM_MODEL.create(temperature=0) | StrOutputParser()
|
||||
|
||||
|
||||
# Five independent extraction chains, one per resume section.
# Each is built once at import time and reused for every call.
_profile_chain = _build_chain(PROFILE_PROMPT)
_education_chain = _build_chain(EDUCATION_PROMPT)
_work_chain = _build_chain(WORK_PROMPT)  # covers both work and internship entries
_project_chain = _build_chain(PROJECT_PROMPT)
_competition_chain = _build_chain(COMPETITION_PROMPT)
|
||||
|
||||
|
||||
async def extract_all(text: str) -> dict:
    """Run the five section chains concurrently via asyncio.gather and merge their results.

    The profile result (when it is a dict) is used as the base mapping; the
    list-shaped sections are attached to it under their own keys. Any section
    whose result has an unexpected type falls back to an empty list.
    """
    log.info("开始5路并行AI提取")
    payload = {"text": text}
    jobs = (
        (_profile_chain, "个人信息"),
        (_education_chain, "教育经历"),
        (_work_chain, "工作+实习经历"),
        (_project_chain, "项目经历"),
        (_competition_chain, "竞赛经历"),
    )
    profile, education, work_intern, project, competition = await asyncio.gather(
        *(_safe_invoke(chain, payload, label) for chain, label in jobs)
    )

    merged = profile if isinstance(profile, dict) else {}
    # Work and internship arrive together in one dict; anything else counts as "no entries".
    employment = work_intern if isinstance(work_intern, dict) else {}
    merged["education"] = education if isinstance(education, list) else []
    merged["work"] = employment.get("work", [])
    merged["internship"] = employment.get("internship", [])
    merged["project"] = project if isinstance(project, list) else []
    merged["competition"] = competition if isinstance(competition, list) else []
    return merged
|
||||
"""构建提取链:prompt → LLM → 文本输出"""
|
||||
return ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")]) | _LLM_MODEL.create(temperature=0) | StrOutputParser()
|
||||
|
||||
|
||||
async def _safe_invoke(chain, inp: dict, label: str):
    """Invoke one chain, log the elapsed time, and parse the reply as JSON.

    Args:
        chain: Object exposing an awaitable ``ainvoke(inp)``.
        inp: Input mapping handed to the chain.
        label: Task name used in the log messages.

    Returns:
        Whatever ``parse_llm_json`` returns for the raw reply, or ``None`` when
        the call or the parsing raises. Callers in this module type-check the
        result with ``isinstance`` and substitute an empty dict/list themselves.
    """
    start = time.perf_counter()
    try:
        raw = await chain.ainvoke(inp)
        log.info(f"AI提取[{label}]完成,耗时: {time.perf_counter() - start:.2f}s")
        return parse_llm_json(raw)
    except Exception as e:
        # Deliberate best-effort: a failed section is reported and skipped so the
        # remaining parallel extractions can still be used.
        log.warning(f"AI提取[{label}]失败,耗时: {time.perf_counter() - start:.2f}s,错误: {e}")
        return None
|
||||
|
||||
|
||||
# ==================== 第一阶段:概览 ====================
|
||||
|
||||
# Stage-one chains, built once at import time; each OVERVIEW_* prompt asks only
# for short profile fields or the identifying names of a section's records.
_overview_profile_chain = _build_chain(OVERVIEW_PROFILE_PROMPT)
_overview_education_chain = _build_chain(OVERVIEW_EDUCATION_PROMPT)
_overview_work_chain = _build_chain(OVERVIEW_WORK_PROMPT)  # work + internship in one reply
_overview_project_chain = _build_chain(OVERVIEW_PROJECT_PROMPT)
_overview_competition_chain = _build_chain(OVERVIEW_COMPETITION_PROMPT)
|
||||
|
||||
|
||||
async def _extract_overview(numbered_text: str) -> dict:
    """Stage one: run the five overview chains concurrently and normalise their results.

    Returns:
        A dict with keys ``profile`` (dict) and ``education`` / ``work`` /
        ``internship`` / ``project`` / ``competition`` (lists). Results of an
        unexpected type are replaced by an empty dict or list.
    """
    payload = {"text": numbered_text}
    profile, schools, employers, projects, contests = await asyncio.gather(
        _safe_invoke(_overview_profile_chain, payload, "概览-个人信息"),
        _safe_invoke(_overview_education_chain, payload, "概览-教育"),
        _safe_invoke(_overview_work_chain, payload, "概览-工作实习"),
        _safe_invoke(_overview_project_chain, payload, "概览-项目"),
        _safe_invoke(_overview_competition_chain, payload, "概览-竞赛"),
    )

    def _list_or_empty(value) -> list:
        return value if isinstance(value, list) else []

    # Work and internship names share one reply; a non-dict reply means neither is available.
    employment = employers if isinstance(employers, dict) else {}
    return {
        "profile": profile if isinstance(profile, dict) else {},
        "education": _list_or_empty(schools),
        "work": employment.get("work", []),
        "internship": employment.get("internship", []),
        "project": _list_or_empty(projects),
        "competition": _list_or_empty(contests),
    }
|
||||
|
||||
|
||||
# ==================== 第二阶段:详情 ====================
|
||||
|
||||
async def _extract_detail(prompt_tpl: str, name: str, numbered_text: str, label: str) -> dict | None:
    """Extract the details of one record by asking about it by name.

    The ``{name}`` placeholder in ``prompt_tpl`` is replaced with the record's
    identifying name and the full numbered resume text is sent as input.

    Args:
        prompt_tpl: Detail prompt containing an optional ``{name}`` placeholder.
        name: Identifying name of the record. It originates from model output,
            so it is coerced to ``str`` and ``None`` is treated as empty.
        numbered_text: Resume text with letter-labelled lines.
        label: Task name used for logging.

    Returns:
        The result of ``_safe_invoke`` for the built chain.
    """
    # Names come from stage-one model output and are not guaranteed to be strings;
    # coercing here keeps ``str.replace`` from raising outside _safe_invoke's try block.
    raw_name = "" if name is None else str(name)
    # The substituted prompt is parsed as a template afterwards, so braces inside the
    # name must be escaped or they would be treated as template variables.
    safe_name = raw_name.replace("{", "{{").replace("}", "}}")
    prompt = prompt_tpl.replace("{name}", safe_name)
    chain = _build_chain(prompt)
    return await _safe_invoke(chain, {"text": numbered_text}, label)
|
||||
|
||||
|
||||
async def _extract_all_details(overview: dict, numbered_text: str) -> dict:
    """Stage two: extract every record's details concurrently, plus the profile extras.

    One task is created for the profile supplement (skills / certificates /
    summary lines) and one per record name listed in ``overview``.

    Args:
        overview: Stage-one result; the lists under ``education``, ``work``,
            ``internship``, ``project`` and ``competition`` hold record names.
        numbered_text: Resume text with letter-labelled lines.

    Returns:
        A dict mapping ``profile_extra`` and each module name to a list of
        dicts, in the same order as the names in ``overview``. A task that
        failed or returned a non-dict contributes an empty dict.
    """
    # (module key, prompt template, log-label prefix), in the order results are grouped.
    specs = (
        ("education", DETAIL_EDUCATION_PROMPT, "详情-教育"),
        ("work", DETAIL_WORK_PROMPT, "详情-工作"),
        ("internship", DETAIL_INTERNSHIP_PROMPT, "详情-实习"),
        ("project", DETAIL_PROJECT_PROMPT, "详情-项目"),
        ("competition", DETAIL_COMPETITION_PROMPT, "详情-竞赛"),
    )

    tasks = [_extract_detail(DETAIL_PROFILE_PROMPT, "", numbered_text, "详情-个人信息补充")]
    owners = ["profile_extra"]  # owners[i] is the module that tasks[i] belongs to
    for module, prompt_tpl, prefix in specs:
        for name in overview.get(module) or []:
            tasks.append(_extract_detail(prompt_tpl, name, numbered_text, f"{prefix}-{name}"))
            owners.append(module)

    # return_exceptions=True turns an unexpected error in one task into a result value
    # instead of letting it propagate and abort the whole extraction.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    details: dict[str, list] = {"profile_extra": [], **{module: [] for module, _, _ in specs}}
    for module, result in zip(owners, results):
        if isinstance(result, BaseException):
            log.warning(f"详情提取任务异常[{module}]: {result!r}")
        details[module].append(result if isinstance(result, dict) else {})
    return details
|
||||
|
||||
|
||||
# ==================== 组装 ====================
|
||||
|
||||
def _assemble(overview: dict, details: dict, line_map: dict[str, str]) -> dict:
    """Combine both stages into one flat result dict.

    The result starts from the stage-one profile fields, gains ``skills``,
    ``certificates`` and ``summary`` from the stage-two profile supplement, and
    gets one list per module in which each record's ``descLines`` references
    are replaced by a ``description`` list of original lines.

    Neither ``overview`` nor ``details`` is modified, so the function can be
    called repeatedly on the same inputs with the same outcome.

    Args:
        overview: Stage-one result; only ``overview["profile"]`` is read here.
        details: Stage-two result as produced by ``_extract_all_details``.
        line_map: Mapping from letter label to original line.

    Returns:
        The assembled dict.
    """
    extras = details.get("profile_extra") or [{}]
    profile_extra = extras[0] if isinstance(extras[0], dict) else {}

    # Copy first so the caller's overview is left untouched.
    result = dict(overview["profile"])
    result["skills"] = profile_extra.get("skills") or []
    result["certificates"] = profile_extra.get("certificates") or []
    summary_texts = _resolve_desc(line_map, profile_extra.get("summaryLines"))
    result["summary"] = "\n".join(summary_texts) if summary_texts else None

    for module in ("education", "work", "internship", "project", "competition"):
        items = []
        for record in details.get(module, []):
            # Build a new dict without descLines rather than popping from the input record.
            item = {key: value for key, value in record.items() if key != "descLines"}
            item["description"] = _resolve_desc(line_map, record.get("descLines"))
            items.append(item)
        result[module] = items
    return result
|
||||
|
||||
|
||||
# ==================== 入口 ====================
|
||||
|
||||
async def extract_all(text: str) -> dict:
    """Entry point: label the lines, run both extraction stages, and assemble the result.

    Stage one collects the profile fields and the record names per module;
    stage two extracts each record's details; the results are then merged
    into a single dict.
    """
    modules = ("education", "work", "internship", "project", "competition")

    line_map, numbered_text = _number_lines(text)
    log.info(f"文本编号完成,共 {len(line_map)} 行")

    log.info("第一阶段:5路并行概览提取")
    overview = await _extract_overview(numbered_text)
    counts = {module: len(overview[module]) for module in modules}
    log.info(f"概览完成 - 教育:{counts['education']} 工作:{counts['work']} 实习:{counts['internship']} 项目:{counts['project']} 竞赛:{counts['competition']}")

    total = sum(counts.values())
    log.info(f"第二阶段:{total}路并行详情提取")
    details = await _extract_all_details(overview, numbered_text)

    assembled = _assemble(overview, details, line_map)
    log.info("两阶段提取完成,数据组装完毕")
    return assembled
|
||||
|
||||
@@ -1,79 +1,87 @@
|
||||
"""简历各模块提取的 System Prompt
|
||||
"""简历两阶段提取 Prompt
|
||||
|
||||
注意:prompt 中的 JSON 示例花括号必须用 {{ }} 转义,避免被 ChatPromptTemplate 当作变量。
|
||||
第一阶段(概览):5路并行,只提取主表短字段和子表标识名,不提取 description。
|
||||
第二阶段(详情):N路并行,每条子表记录单独提取全部字段,description 用字母编号引用。
|
||||
花括号用 {{ }} 转义,避免被 ChatPromptTemplate 当作变量。{name} 为运行时替换的记录标识名。
|
||||
"""
|
||||
|
||||
PROFILE_PROMPT = """从简历文本中仅提取个人基本信息,原文提取不要改写,输出JSON:
|
||||
```json
|
||||
{{
|
||||
"name": "姓名",
|
||||
"email": "邮箱",
|
||||
"mobileNumber": "手机号",
|
||||
"city": "所在城市",
|
||||
"wechatNumber": "微信号",
|
||||
"portfolioUrl": "作品集链接",
|
||||
"skills": ["技能1"],
|
||||
"certificates": ["证书1"],
|
||||
"summary": "个人概述原文"
|
||||
}}
|
||||
```
|
||||
规则:只提取个人信息,不提取经历内容。summary只填"自我评价/个人概述"原文。没有的填null,数组填[]。只输出JSON。"""
|
||||
# ==================== 第一阶段:概览提取 ====================
|
||||
|
||||
EDUCATION_PROMPT = """从简历文本中仅提取教育经历,原文提取不要改写,输出JSON数组:
|
||||
OVERVIEW_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造,没有的填null。
|
||||
从中仅提取个人基本信息(不含技能、证书、自我评价),输出JSON:
|
||||
```json
|
||||
[{{
|
||||
"school": "学校",
|
||||
"major": "专业",
|
||||
"degree": "学历",
|
||||
"studyType": "全日制/非全日制",
|
||||
"startDate": "2020.09",
|
||||
"endDate": "2024.06",
|
||||
"description": ["原文段落"]
|
||||
}}]
|
||||
{{ "name": "姓名", "email": "邮箱", "mobileNumber": "手机号", "city": "所在城市", "wechatNumber": "微信号", "portfolioUrl": "作品集链接" }}
|
||||
```
|
||||
规则:只提取教育经历,不提取工作/实习/项目/竞赛。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
|
||||
规则:只提取以上6个字段,不提取skills/certificates/summary/经历。没有的填null。只输出JSON。"""
|
||||
|
||||
WORK_PROMPT = """从简历文本中仅提取工作经历和实习经历,原文提取不要改写,输出JSON:
|
||||
OVERVIEW_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
从中仅提取教育经历的学校名称列表,输出JSON数组:
|
||||
```json
|
||||
{{
|
||||
"work": [{{
|
||||
"companyName": "公司",
|
||||
"position": "职位",
|
||||
"startDate": "2024.07",
|
||||
"endDate": "2025.03",
|
||||
"description": ["原文段落"]
|
||||
}}],
|
||||
"internship": [{{
|
||||
"companyName": "公司",
|
||||
"position": "职位",
|
||||
"startDate": "2023.06",
|
||||
"endDate": "2023.09",
|
||||
"description": ["原文段落"]
|
||||
}}]
|
||||
}}
|
||||
["北京大学", "清华大学"]
|
||||
```
|
||||
规则:标注"实习"的归internship,其余归work。不提取项目/教育/竞赛。时间格式YYYY.MM。没有填[]。只输出JSON。"""
|
||||
规则:只提取学校名称,不提取其他字段。没有输出[]。只输出JSON。"""
|
||||
|
||||
PROJECT_PROMPT = """从简历文本中仅提取项目经历,原文提取不要改写,输出JSON数组:
|
||||
OVERVIEW_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
从中仅提取工作经历和实习经历的公司名称列表,输出JSON:
|
||||
```json
|
||||
[{{
|
||||
"companyName": "所属公司",
|
||||
"projectName": "项目名",
|
||||
"role": "角色名称(如:后端开发、项目经理、前端工程师,只填角色名不填职责描述)",
|
||||
"startDate": "2023.03",
|
||||
"endDate": "2023.12",
|
||||
"description": ["原文段落"]
|
||||
}}]
|
||||
{{ "work": ["阿里巴巴", "腾讯"], "internship": ["字节跳动"] }}
|
||||
```
|
||||
规则:只提取项目经历,不提取工作/实习/教育/竞赛。role只填简短角色名,职责内容放description。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
|
||||
规则:标注"实习"的归internship,其余归work。只提取公司名称。没有填[]。只输出JSON。"""
|
||||
|
||||
COMPETITION_PROMPT = """从简历文本中仅提取竞赛/获奖经历,原文提取不要改写,输出JSON数组:
|
||||
OVERVIEW_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
从中仅提取项目经历的项目名称列表,输出JSON数组:
|
||||
```json
|
||||
[{{
|
||||
"competitionName": "竞赛名",
|
||||
"award": "获奖情况",
|
||||
"awardDate": "2023.07",
|
||||
"description": ["原文段落"]
|
||||
}}]
|
||||
["订单系统重构", "支付网关"]
|
||||
```
|
||||
规则:只提取竞赛获奖,不提取其他经历。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
|
||||
规则:只提取项目名称,不提取其他字段。没有输出[]。只输出JSON。"""
|
||||
|
||||
OVERVIEW_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
从中仅提取竞赛/获奖经历的竞赛名称列表,输出JSON数组:
|
||||
```json
|
||||
["ACM区域赛", "数学建模大赛"]
|
||||
```
|
||||
规则:只提取竞赛名称,不提取其他字段。没有输出[]。只输出JSON。"""
|
||||
|
||||
# ==================== 第二阶段:详情提取 ====================
|
||||
|
||||
DETAIL_EDUCATION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
请提取"{name}"这条教育经历的详细信息,输出JSON:
|
||||
```json
|
||||
{{ "school": "学校", "major": "专业", "degree": "学历", "studyType": "全日制/非全日制", "startDate": "2020.09", "endDate": "2024.06", "descLines": "e,f,g" }}
|
||||
```
|
||||
规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
|
||||
|
||||
DETAIL_WORK_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
请提取"{name}"这条工作经历的详细信息,输出JSON:
|
||||
```json
|
||||
{{ "companyName": "公司", "position": "职位", "startDate": "2024.07", "endDate": "2025.03", "descLines": "h,i,j" }}
|
||||
```
|
||||
规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
|
||||
|
||||
DETAIL_INTERNSHIP_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
请提取"{name}"这条实习经历的详细信息,输出JSON:
|
||||
```json
|
||||
{{ "companyName": "公司", "position": "职位", "startDate": "2023.06", "endDate": "2023.09", "descLines": "p,q,r" }}
|
||||
```
|
||||
规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
|
||||
|
||||
DETAIL_PROJECT_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
请提取"{name}"这条项目经历的详细信息,输出JSON:
|
||||
```json
|
||||
{{ "companyName": "所属公司", "projectName": "项目名", "role": "角色名称", "startDate": "2023.03", "endDate": "2023.12", "descLines": "u,v,w" }}
|
||||
```
|
||||
规则:role只填简短角色名。descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
|
||||
|
||||
DETAIL_COMPETITION_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
请提取"{name}"这条竞赛/获奖经历的详细信息,输出JSON:
|
||||
```json
|
||||
{{ "competitionName": "竞赛名", "award": "获奖情况", "awardDate": "2023.07", "descLines": "ae,af" }}
|
||||
```
|
||||
规则:descLines填描述内容对应的字母编号逗号分隔。时间格式YYYY.MM。没有的填null。只输出JSON。"""
|
||||
|
||||
DETAIL_PROFILE_PROMPT = """简历文本每行以 [字母编号] 开头。严格根据简历原文提取,不要猜测或编造。
|
||||
从中提取技能标签、证书和自我评价/个人概述,输出JSON:
|
||||
```json
|
||||
{{ "skills": ["技能1"], "certificates": ["证书1"], "summaryLines": "k,l,m" }}
|
||||
```
|
||||
规则:skills填技能标签数组。certificates填证书数组。summaryLines填自我评价/个人概述对应的字母编号逗号分隔。没有的填null,数组填[]。只输出JSON。"""
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""简历解析 Service
|
||||
|
||||
上传简历文件 → 解析为纯文本 → AI 并行结构化 → 写入数据库。
|
||||
依赖:file_parser(文件解析工具)、resume_extractor(AI并行提取)
|
||||
上传简历文件 → 解析为纯文本 → AI 两阶段并行结构化 → 写入数据库。
|
||||
依赖:file_parser(文件解析工具)、resume_extractor(AI两阶段并行提取)
|
||||
使用表:bg_user_resume(主表)、bg_user_resume_education/work/internship/project/competition(5张子表)
|
||||
"""
|
||||
|
||||
@@ -25,16 +25,16 @@ from app.tool.snowflake import next_id
|
||||
class ResumeParseService:
|
||||
|
||||
async def parse_and_extract(self, filename: str, content: bytes) -> dict:
    """Parse the uploaded file to text and run the two-stage AI extraction.

    No database work happens here. The file parsing runs in a worker thread
    via ``asyncio.to_thread``.

    Args:
        filename: Original file name, passed to ``parse_to_text``.
        content: Raw file bytes.

    Returns:
        The structured dict returned by ``extract_all``.

    Raises:
        ValueError: If the parsed text is empty or whitespace-only.
    """
    log.info(f"开始解析简历文件: {filename}")
    text = await asyncio.to_thread(parse_to_text, filename, content)
    if not text or not text.strip():
        raise ValueError("文件内容为空,无法解析")
    log.info(f"文件解析完成,文本长度: {len(text)}")

    log.info("开始AI两阶段并行结构化提取")
    parsed = await extract_all(text)
    log.info("AI两阶段并行结构化提取完成")
    return parsed
|
||||
|
||||
async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int:
|
||||
@@ -103,4 +103,4 @@ def _to_paragraphs(texts: list[str] | None) -> list[dict] | None:
|
||||
"""将字符串数组转为 [{id, text}] 格式的描述段落"""
|
||||
if not texts:
|
||||
return None
|
||||
return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t]
|
||||
return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t]
|
||||
|
||||
+8
-14
@@ -1,24 +1,25 @@
|
||||
"""文件解析工具
|
||||
|
||||
将上传的简历文件(PDF / Word / TXT)转换为纯文本字符串。
|
||||
PDF 使用 PyMuPDF (fitz) 按文本块提取,保持段落边界和阅读顺序。
|
||||
"""
|
||||
|
||||
import io
|
||||
|
||||
import pdfplumber
|
||||
import fitz
|
||||
from docx import Document
|
||||
|
||||
from app.core.logger import log
|
||||
|
||||
|
||||
def parse_pdf(content: bytes) -> str:
    """Extract the text of a PDF block by block, skipping image blocks.

    Blocks are requested with ``sort=True`` and appended page by page, so each
    block becomes one newline-separated chunk of the returned text.

    Args:
        content: Raw PDF bytes.

    Returns:
        The stripped, non-empty text blocks joined with newlines.
    """
    text_parts: list[str] = []
    with fitz.open(stream=content, filetype="pdf") as doc:
        for page in doc:
            for block in page.get_text("blocks", sort=True):
                # block[4] is the block text; block[6] is the block type (0 = text, 1 = image).
                block_text = block[4].strip() if block[6] == 0 else ""
                if block_text:
                    text_parts.append(block_text)
    return "\n".join(text_parts)
|
||||
|
||||
|
||||
@@ -28,22 +29,16 @@ def parse_docx(content: bytes) -> str:
|
||||
doc = Document(io.BytesIO(content))
|
||||
except Exception:
|
||||
raise ValueError("无法解析该 Word 文件,如果是旧版 .doc 格式,请另存为 .docx 后重试")
|
||||
|
||||
text_parts: list[str] = []
|
||||
|
||||
# 段落
|
||||
for para in doc.paragraphs:
|
||||
text = para.text.strip()
|
||||
if text:
|
||||
text_parts.append(text)
|
||||
|
||||
# 表格
|
||||
for table in doc.tables:
|
||||
for row in table.rows:
|
||||
row_text = "\t".join(cell.text.strip() for cell in row.cells)
|
||||
if row_text.strip():
|
||||
text_parts.append(row_text)
|
||||
|
||||
return "\n".join(text_parts)
|
||||
|
||||
|
||||
@@ -61,7 +56,6 @@ def parse_to_text(filename: str, content: bytes) -> str:
|
||||
"""根据文件名后缀自动选择解析方法,返回纯文本"""
|
||||
suffix = filename[filename.rfind("."):].lower() if "." in filename else ""
|
||||
log.info(f"解析文件: {filename},类型: {suffix}")
|
||||
|
||||
if suffix == ".pdf":
|
||||
return parse_pdf(content)
|
||||
elif suffix in (".docx", ".doc"):
|
||||
|
||||
+1
-1
@@ -43,7 +43,7 @@ python-multipart>=0.0.9
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# 文件解析
|
||||
pdfplumber>=0.11.0
|
||||
pymupdf>=1.24.0
|
||||
python-docx>=1.1.0
|
||||
|
||||
# 雪花ID
|
||||
|
||||
Reference in New Issue
Block a user