修改简历提取方式
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
"""简历两阶段并行提取
|
||||
|
||||
第一阶段:5路并行提取主表信息 + 各子表标识名(极快,输出极短)。
|
||||
第二阶段:N路并行提取每条子表记录的详情,description 用字母编号引用原文。
|
||||
第一阶段:5路并行提取主表短字段 + 各子表标识名(极快,输出极短)。
|
||||
第二阶段:N+1路并行提取每条子表记录的详情(含description原文)+ 个人信息补充(skills/certificates/summary)。
|
||||
最终组装为与原方案完全一致的 dict 结构,上下游无感知。
|
||||
"""
|
||||
|
||||
@@ -24,31 +24,6 @@ from app.tool.json_helper import parse_llm_json
|
||||
_LLM_MODEL = LLM.DOUBAO_LITE_32K
|
||||
|
||||
|
||||
# ==================== 文本编号 ====================
|
||||
|
||||
def _gen_alpha(n: int):
    """Yield ``n`` lowercase alphabetic labels in spreadsheet-column order.

    The sequence is ``a, b, ..., z, aa, ab, ..., az, ba, ..., zz, aaa, ...``
    (bijective base-26). The first 702 labels match the earlier
    two-letter-only formula; beyond that the old formula produced non-letter
    characters such as ``{a``, so longer labels are generated instead.

    Args:
        n: Number of labels to produce. Zero or a negative value yields
            nothing.

    Yields:
        The labels, one per index, each unique and purely ``a``-``z``.
    """
    base = ord("a")
    for index in range(n):
        # Bijective numeration has no zero digit, so work on the 1-based
        # position and subtract one before every division step.
        remaining = index + 1
        label = ""
        while remaining > 0:
            remaining, digit = divmod(remaining - 1, 26)
            label = chr(base + digit) + label
        yield label
|
||||
|
||||
|
||||
def _number_lines(text: str) -> tuple[dict[str, str], str]:
    """Split text into non-blank lines and tag each one with a letter label.

    Args:
        text: Raw text to split on line breaks.

    Returns:
        A ``(line_map, numbered)`` pair: ``line_map`` maps each label from
        ``_gen_alpha`` to its original line, and ``numbered`` is the same
        lines rendered as ``"[label] line"`` joined with ``"\\n"``.
    """
    # Normalise CRLF / lone CR first so Windows-style input does not leave a
    # stray "\r" at the end of every stored line.
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    kept = [line for line in unified.split("\n") if line.strip()]

    # Pair labels and lines once; both outputs are derived from the pairs.
    pairs = list(zip(_gen_alpha(len(kept)), kept))
    line_map = dict(pairs)
    numbered = "\n".join(f"[{label}] {line}" for label, line in pairs)
    return line_map, numbered
|
||||
|
||||
|
||||
def _resolve_desc(line_map: dict[str, str], desc_str: str | None) -> list[str]:
|
||||
"""将逗号分隔的字母编号字符串解析为原文列表"""
|
||||
if not desc_str or not isinstance(desc_str, str):
|
||||
return []
|
||||
keys = [k.strip() for k in desc_str.split(",") if k.strip()]
|
||||
return [line_map[k] for k in keys if k in line_map]
|
||||
|
||||
|
||||
# ==================== LLM 调用工具 ====================
|
||||
|
||||
def _build_chain(prompt: str):
|
||||
@@ -77,9 +52,9 @@ _overview_project_chain = _build_chain(OVERVIEW_PROJECT_PROMPT)
|
||||
_overview_competition_chain = _build_chain(OVERVIEW_COMPETITION_PROMPT)
|
||||
|
||||
|
||||
async def _extract_overview(numbered_text: str) -> dict:
|
||||
async def _extract_overview(text: str) -> dict:
|
||||
"""第一阶段:5路并行提取概览信息"""
|
||||
inp = {"text": numbered_text}
|
||||
inp = {"text": text}
|
||||
profile, edu_names, work_names, proj_names, comp_names = await asyncio.gather(
|
||||
_safe_invoke(_overview_profile_chain, inp, "概览-个人信息"),
|
||||
_safe_invoke(_overview_education_chain, inp, "概览-教育"),
|
||||
@@ -99,36 +74,36 @@ async def _extract_overview(numbered_text: str) -> dict:
|
||||
|
||||
# ==================== 第二阶段:详情 ====================
|
||||
|
||||
async def _extract_detail(prompt_tpl: str, name: str, text: str, label: str) -> dict | None:
    """Extract the details of a single sub-table record.

    Args:
        prompt_tpl: Prompt template containing a literal ``{name}`` marker.
        name: Identifying name of the record; substituted for ``{name}``.
        text: Text passed to the chain under the ``"text"`` key.
        label: Label forwarded to ``_safe_invoke`` for this call.

    Returns:
        Whatever ``_safe_invoke`` returns for the chain (annotated as
        ``dict | None``).
    """
    # Plain string replacement instead of str.format: presumably the template
    # carries other placeholders (e.g. for the "text" input) that must be left
    # intact for the chain -- TODO confirm against the prompt definitions.
    prompt = prompt_tpl.replace("{name}", name)
    chain = _build_chain(prompt)
    return await _safe_invoke(chain, {"text": text}, label)
|
||||
|
||||
|
||||
async def _extract_all_details(overview: dict, numbered_text: str) -> dict:
|
||||
"""第二阶段:根据概览结果,N路并行提取所有子表记录详情 + 个人信息的skills/certificates/summary"""
|
||||
async def _extract_all_details(overview: dict, text: str) -> dict:
|
||||
"""第二阶段:根据概览结果,N+1路并行提取所有子表记录详情 + 个人信息补充"""
|
||||
tasks: list = []
|
||||
task_meta: list[tuple[str, int]] = [] # (模块名, 索引) 用于结果归位
|
||||
task_meta: list[tuple[str, int]] = []
|
||||
|
||||
# profile 的 skills/certificates/summaryLines
|
||||
tasks.append(_extract_detail(DETAIL_PROFILE_PROMPT, "", numbered_text, "详情-个人信息补充"))
|
||||
# profile 补充:skills/certificates/summary
|
||||
tasks.append(_extract_detail(DETAIL_PROFILE_PROMPT, "", text, "详情-个人信息补充"))
|
||||
task_meta.append(("profile_extra", 0))
|
||||
|
||||
for i, name in enumerate(overview["education"]):
|
||||
tasks.append(_extract_detail(DETAIL_EDUCATION_PROMPT, name, numbered_text, f"详情-教育-{name}"))
|
||||
tasks.append(_extract_detail(DETAIL_EDUCATION_PROMPT, name, text, f"详情-教育-{name}"))
|
||||
task_meta.append(("education", i))
|
||||
for i, name in enumerate(overview["work"]):
|
||||
tasks.append(_extract_detail(DETAIL_WORK_PROMPT, name, numbered_text, f"详情-工作-{name}"))
|
||||
tasks.append(_extract_detail(DETAIL_WORK_PROMPT, name, text, f"详情-工作-{name}"))
|
||||
task_meta.append(("work", i))
|
||||
for i, name in enumerate(overview["internship"]):
|
||||
tasks.append(_extract_detail(DETAIL_INTERNSHIP_PROMPT, name, numbered_text, f"详情-实习-{name}"))
|
||||
tasks.append(_extract_detail(DETAIL_INTERNSHIP_PROMPT, name, text, f"详情-实习-{name}"))
|
||||
task_meta.append(("internship", i))
|
||||
for i, name in enumerate(overview["project"]):
|
||||
tasks.append(_extract_detail(DETAIL_PROJECT_PROMPT, name, numbered_text, f"详情-项目-{name}"))
|
||||
tasks.append(_extract_detail(DETAIL_PROJECT_PROMPT, name, text, f"详情-项目-{name}"))
|
||||
task_meta.append(("project", i))
|
||||
for i, name in enumerate(overview["competition"]):
|
||||
tasks.append(_extract_detail(DETAIL_COMPETITION_PROMPT, name, numbered_text, f"详情-竞赛-{name}"))
|
||||
tasks.append(_extract_detail(DETAIL_COMPETITION_PROMPT, name, text, f"详情-竞赛-{name}"))
|
||||
task_meta.append(("competition", i))
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
@@ -140,22 +115,19 @@ async def _extract_all_details(overview: dict, numbered_text: str) -> dict:
|
||||
|
||||
# ==================== 组装 ====================
|
||||
|
||||
def _assemble(overview: dict, details: dict) -> dict:
    """Assemble the results of both stages into one flat dict.

    Args:
        overview: Stage-one result; ``overview["profile"]`` holds the main
            profile fields.
        details: Stage-two result. ``details["profile_extra"]`` is a list
            whose first element may carry ``skills``, ``certificates`` and
            ``summary``; each of ``education``, ``work``, ``internship``,
            ``project`` and ``competition`` is a list of record dicts.

    Returns:
        A new dict with the profile fields, ``skills`` and ``certificates``
        (``[]`` when absent), ``summary`` (``None`` when absent), and one
        list per sub-table module in which every record has a
        ``description`` entry (``[]`` when missing or empty).
    """
    # The first element may be None when that extraction produced nothing.
    extras = details.get("profile_extra") or [{}]
    profile_extra = extras[0] or {}

    # Work on a copy so the caller's overview["profile"] is left untouched.
    result = dict(overview["profile"] or {})
    result["skills"] = profile_extra.get("skills") or []
    result["certificates"] = profile_extra.get("certificates") or []
    result["summary"] = profile_extra.get("summary")

    for module in ("education", "work", "internship", "project", "competition"):
        # Copy each record as well; only "description" is normalised.
        result[module] = [
            {**item, "description": item.get("description") or []}
            for item in details.get(module) or []
        ]
    return result
|
||||
@@ -165,17 +137,14 @@ def _assemble(overview: dict, details: dict, line_map: dict[str, str]) -> dict:
|
||||
|
||||
async def extract_all(text: str) -> dict:
    """Run the two-stage extraction on resume text and return the result.

    Stage one calls ``_extract_overview`` on the text; stage two calls
    ``_extract_all_details`` with that overview and the same text; the two
    results are then merged by ``_assemble``.

    Args:
        text: The resume text to extract from.

    Returns:
        The dict produced by ``_assemble``.
    """
    log.info("第一阶段:5路并行概览提取")
    overview = await _extract_overview(text)
    log.info(f"概览完成 - 教育:{len(overview['education'])} 工作:{len(overview['work'])} 实习:{len(overview['internship'])} 项目:{len(overview['project'])} 竞赛:{len(overview['competition'])}")

    total = sum(len(overview[m]) for m in ("education", "work", "internship", "project", "competition"))
    # One extra task on top of the per-record ones: the profile supplement.
    log.info(f"第二阶段:{total + 1}路并行详情提取")
    details = await _extract_all_details(overview, text)

    result = _assemble(overview, details)
    log.info("两阶段提取完成,数据组装完毕")
    return result
|
||||
|
||||
Reference in New Issue
Block a user