From 7ac1e723a4e5f85ec72ef54bf16300b3d1acd8f9 Mon Sep 17 00:00:00 2001 From: zk Date: Fri, 3 Apr 2026 11:20:17 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E7=AE=80=E5=8E=86=E4=B8=8A?= =?UTF-8?q?=E4=BC=A0=E4=B8=BA=E5=A4=9A=E8=B7=AF=E4=B8=8A=E4=BC=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .kiro/steering/项目结构说明.md | 7 +- app/ai/models.py | 6 +- app/ai/resume_extractor/__init__.py | 1 + app/ai/resume_extractor/extractor.py | 61 +++++++++++++ app/ai/resume_extractor/prompts.py | 79 ++++++++++++++++ app/services/resume_parse_service.py | 132 +++------------------------ 6 files changed, 164 insertions(+), 122 deletions(-) create mode 100644 app/ai/resume_extractor/__init__.py create mode 100644 app/ai/resume_extractor/extractor.py create mode 100644 app/ai/resume_extractor/prompts.py diff --git a/.kiro/steering/项目结构说明.md b/.kiro/steering/项目结构说明.md index a9a2516..12878ee 100644 --- a/.kiro/steering/项目结构说明.md +++ b/.kiro/steering/项目结构说明.md @@ -31,7 +31,10 @@ offerpie_python_ai/ │ └─ responses.py # 统一响应模型 StandardResponse(code/msg/data/timestamp/uuid) │ ├─ ai/ # **AI 能力层** - │ └─ models.py # LLM 模型枚举(LLM.DOUBAO_PRO_256K、DEEPSEEK_V3、GPT_4O 等),基于 LangChain ChatOpenAI + │ ├─ models.py # LLM 模型枚举(LLM.DOUBAO_PRO_32K、DEEPSEEK_V3、GPT_4O 等),基于 LangChain ChatOpenAI + │ └─ resume_extractor/ # 简历 AI 提取模块 + │ ├─ prompts.py # 5 个提取任务的 System Prompt(个人信息/教育/工作+实习/项目/竞赛) + │ └─ extractor.py # AI 并行提取(extract_all 入口,asyncio.gather 5 路并行) │ ├─ api/ # **路由层**(REST API 接口) │ ├─ health.py # 健康检查接口 GET /health/ @@ -62,7 +65,7 @@ offerpie_python_ai/ |------|----------|-------------| | **config** | 统一配置管理,基于 Pydantic Settings,支持 .env 文件加载 | `Settings`(数据库、Redis、LLM供应商、JWT、CORS、日志等全部配置项) | | **core** | 核心基础设施:数据库连接、Redis连接、鉴权、日志、中间件、异常处理、统一响应 | `database.py`、`redis.py`、`auth.py`、`middleware.py`、`exceptions.py`、`logger.py`、`StandardResponse` | -| **ai** | AI 模型管理,封装多供应商 LLM 实例创建,基于 LangChain ChatOpenAI | `LLM` 
枚举(火山引擎:doubao/deepseek,心缘:gpt-4o/claude) | +| **ai** | AI 模型管理 + 业务 AI 能力 | `LLM` 枚举、`resume_extractor/`(简历并行提取:5路 AI 同时提取个人信息/教育/工作+实习/项目/竞赛) | | **api** | REST API 路由定义 | `health.py`(健康检查)、`resume.py`(简历上传解析) | | **models** | SQLAlchemy ORM 模型,与 Java 端共享同一数据库 | `FuncPermission`、`UserFuncPermissionStock`、`UserFuncUsageLog`、`UserResume`、`UserResumeEducation`/`Work`/`Internship`/`Project`/`Competition` | | **tool** | 无状态通用工具,不依赖数据库/Redis/用户上下文 | `file_parser.py`(PDF/Word/TXT 文件解析为纯文本)、`snowflake.py`(雪花ID生成) | diff --git a/app/ai/models.py b/app/ai/models.py index 7350217..4d60727 100644 --- a/app/ai/models.py +++ b/app/ai/models.py @@ -22,9 +22,8 @@ class LLM(Enum): """所有可用模型,每个枚举值 = (模型名, api_key函数, base_url函数)""" # 火山引擎 - DOUBAO_PRO_256K = ("doubao-pro-256k", *_VOLCENGINE) - DOUBAO_PRO_32K = ("doubao-pro-32k", *_VOLCENGINE) - DOUBAO_LITE_128K = ("doubao-lite-128k", *_VOLCENGINE) + + DOUBAO_PRO_32K = ("doubao-1-5-pro-32k-250115", *_VOLCENGINE) DEEPSEEK_V3 = ("deepseek-v3-250324", *_VOLCENGINE) DEEPSEEK_R1 = ("deepseek-r1-250528", *_VOLCENGINE) DOUBAO_SEED_LITE = ("doubao-seed-2-0-lite-260215", *_VOLCENGINE) @@ -34,6 +33,7 @@ class LLM(Enum): GPT_4O = ("gpt-4o", *_CARDIAC) GPT_4O_MINI = ("gpt-4o-mini", *_CARDIAC) CLAUDE_SONNET_4 = ("claude-sonnet-4-20250514", *_CARDIAC) + GEMINI_FLASH = ("gemini-2.5-flash", *_CARDIAC) def __init__(self, model_name: str, api_key_fn, base_url_fn): self.model_name = model_name diff --git a/app/ai/resume_extractor/__init__.py b/app/ai/resume_extractor/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/app/ai/resume_extractor/__init__.py @@ -0,0 +1 @@ + diff --git a/app/ai/resume_extractor/extractor.py b/app/ai/resume_extractor/extractor.py new file mode 100644 index 0000000..e50f65e --- /dev/null +++ b/app/ai/resume_extractor/extractor.py @@ -0,0 +1,61 @@ +"""简历并行提取:将完整简历文本拆分为5个AI任务并行提取""" + +import asyncio + +from langchain_core.output_parsers import JsonOutputParser +from langchain_core.prompts import 
ChatPromptTemplate + +from app.ai.models import LLM +from app.ai.resume_extractor.prompts import ( + PROFILE_PROMPT, EDUCATION_PROMPT, WORK_PROMPT, + PROJECT_PROMPT, COMPETITION_PROMPT, +) +from app.core.logger import log + + +def _build_chain(prompt: str): + """构建单个提取链:prompt → LLM → JSON解析""" + return ( + ChatPromptTemplate.from_messages([("system", prompt), ("human", "{text}")]) + | LLM.DOUBAO_PRO_32K.create(temperature=0) + | JsonOutputParser() + ) + + +# 5 条独立的提取链 +_profile_chain = _build_chain(PROFILE_PROMPT) +_education_chain = _build_chain(EDUCATION_PROMPT) +_work_chain = _build_chain(WORK_PROMPT) +_project_chain = _build_chain(PROJECT_PROMPT) +_competition_chain = _build_chain(COMPETITION_PROMPT) + + +async def extract_all(text: str) -> dict: + """asyncio.gather 并行提取简历所有模块,返回合并后的结构化数据""" + log.info("开始5路并行AI提取") + inp = {"text": text} + + profile, education, work_intern, project, competition = await asyncio.gather( + _safe_invoke(_profile_chain, inp, "个人信息"), + _safe_invoke(_education_chain, inp, "教育经历"), + _safe_invoke(_work_chain, inp, "工作+实习经历"), + _safe_invoke(_project_chain, inp, "项目经历"), + _safe_invoke(_competition_chain, inp, "竞赛经历"), + ) + + result = profile if isinstance(profile, dict) else {} + result["education"] = education if isinstance(education, list) else [] + result["work"] = work_intern.get("work", []) if isinstance(work_intern, dict) else [] + result["internship"] = work_intern.get("internship", []) if isinstance(work_intern, dict) else [] + result["project"] = project if isinstance(project, list) else [] + result["competition"] = competition if isinstance(competition, list) else [] + return result + + +async def _safe_invoke(chain, inp: dict, label: str): + """单个链调用,失败返回空""" + try: + return await chain.ainvoke(inp) + except Exception as e: + log.warning(f"AI提取[{label}]失败: {e}") + return {} if "个人信息" in label else [] diff --git a/app/ai/resume_extractor/prompts.py b/app/ai/resume_extractor/prompts.py new file mode 100644 index 
0000000..81b6061 --- /dev/null +++ b/app/ai/resume_extractor/prompts.py @@ -0,0 +1,79 @@ +"""简历各模块提取的 System Prompt + +注意:prompt 中的 JSON 示例花括号必须用 {{ }} 转义,避免被 ChatPromptTemplate 当作变量。 +""" + +PROFILE_PROMPT = """从简历文本中仅提取个人基本信息,原文提取不要改写,输出JSON: +```json +{{ + "name": "姓名", + "email": "邮箱", + "mobileNumber": "手机号", + "city": "所在城市", + "wechatNumber": "微信号", + "portfolioUrl": "作品集链接", + "skills": ["技能1"], + "certificates": ["证书1"], + "summary": "个人概述原文" +}} +``` +规则:只提取个人信息,不提取经历内容。summary只填"自我评价/个人概述"原文。没有的填null,数组填[]。只输出JSON。""" + +EDUCATION_PROMPT = """从简历文本中仅提取教育经历,原文提取不要改写,输出JSON数组: +```json +[{{ + "school": "学校", + "major": "专业", + "degree": "学历", + "studyType": "全日制/非全日制", + "startDate": "2020.09", + "endDate": "2024.06", + "description": ["原文段落"] +}}] +``` +规则:只提取教育经历,不提取工作/实习/项目/竞赛。时间格式YYYY.MM。没有输出[]。只输出JSON。""" + +WORK_PROMPT = """从简历文本中仅提取工作经历和实习经历,原文提取不要改写,输出JSON: +```json +{{ + "work": [{{ + "companyName": "公司", + "position": "职位", + "startDate": "2024.07", + "endDate": "2025.03", + "description": ["原文段落"] + }}], + "internship": [{{ + "companyName": "公司", + "position": "职位", + "startDate": "2023.06", + "endDate": "2023.09", + "description": ["原文段落"] + }}] +}} +``` +规则:标注"实习"的归internship,其余归work。不提取项目/教育/竞赛。时间格式YYYY.MM。没有填[]。只输出JSON。""" + +PROJECT_PROMPT = """从简历文本中仅提取项目经历,原文提取不要改写,输出JSON数组: +```json +[{{ + "companyName": "所属公司", + "projectName": "项目名", + "role": "角色名称(如:后端开发、项目经理、前端工程师,只填角色名不填职责描述)", + "startDate": "2023.03", + "endDate": "2023.12", + "description": ["原文段落"] +}}] +``` +规则:只提取项目经历,不提取工作/实习/教育/竞赛。role只填简短角色名,职责内容放description。时间格式YYYY.MM。没有输出[]。只输出JSON。""" + +COMPETITION_PROMPT = """从简历文本中仅提取竞赛/获奖经历,原文提取不要改写,输出JSON数组: +```json +[{{ + "competitionName": "竞赛名", + "award": "获奖情况", + "awardDate": "2023.07", + "description": ["原文段落"] +}}] +``` +规则:只提取竞赛获奖,不提取其他经历。时间格式YYYY.MM。没有输出[]。只输出JSON。""" \ No newline at end of file diff --git a/app/services/resume_parse_service.py b/app/services/resume_parse_service.py index cda25ef..bd25cd1 100644 --- 
a/app/services/resume_parse_service.py +++ b/app/services/resume_parse_service.py @@ -1,18 +1,16 @@ """简历解析 Service -上传简历文件 → 解析为纯文本 → AI 结构化 → 写入数据库。 -依赖:file_parser(文件解析工具)、LLM(AI模型) +上传简历文件 → 解析为纯文本 → AI 并行结构化 → 写入数据库。 +依赖:file_parser(文件解析工具)、resume_extractor(AI并行提取) 使用表:bg_user_resume(主表)、bg_user_resume_education/work/internship/project/competition(5张子表) """ import asyncio -import json import shortuuid -from langchain_core.messages import SystemMessage, HumanMessage from sqlalchemy.ext.asyncio import AsyncSession -from app.ai.models import LLM +from app.ai.resume_extractor.extractor import extract_all from app.core.logger import log from app.models.user_resume import UserResume from app.models.user_resume_competition import UserResumeCompetition @@ -24,115 +22,26 @@ from app.tool.file_parser import parse_to_text from app.tool.snowflake import next_id -_SYSTEM_PROMPT = """你是一个专业的简历解析助手。请将用户提供的简历纯文本解析为结构化JSON。 - -输出格式要求(严格按此JSON结构输出,不要输出任何其他内容): -```json -{ - "name": "姓名", - "email": "邮箱", - "mobileNumber": "手机号", - "city": "所在城市", - "wechatNumber": "微信号(如有)", - "portfolioUrl": "作品集链接(如有)", - "skills": ["技能1", "技能2"], - "certificates": ["证书1", "证书2"], - "summary": "个人概述/自我评价", - "education": [ - { - "school": "学校名称", - "major": "专业", - "degree": "学历(大专/本科/硕士/博士)", - "studyType": "学习形式(全日制/非全日制)", - "startDate": "2020.09", - "endDate": "2024.06", - "description": ["描述段落1", "描述段落2"] - } - ], - "work": [ - { - "companyName": "公司名称", - "position": "职位", - "startDate": "2024.07", - "endDate": "2025.03", - "description": ["工作描述段落1", "工作描述段落2"] - } - ], - "internship": [ - { - "companyName": "公司名称", - "position": "实习职位", - "startDate": "2023.06", - "endDate": "2023.09", - "description": ["实习描述段落1"] - } - ], - "project": [ - { - "companyName": "所属公司(如有)", - "projectName": "项目名称", - "role": "担任角色", - "startDate": "2023.03", - "endDate": "2023.12", - "description": ["项目描述段落1"] - } - ], - "competition": [ - { - "competitionName": "竞赛名称", - "award": "获奖情况", - "awardDate": 
"2023.07", - "description": ["竞赛描述段落1"] - } - ] -} -``` - -规则: -1. 时间格式统一为 YYYY.MM(如 2023.09),如果只有年份则写 YYYY.01 -2. 没有的字段填 null,没有的数组填 [] -3. description 是字符串数组,每个元素是一个描述段落 -4. 区分工作经历和实习经历:明确标注"实习"的归入 internship,其余归入 work -5. 只输出 JSON,不要输出任何解释文字""" - - class ResumeParseService: async def parse_and_extract(self, filename: str, content: bytes) -> dict: - """文件解析 + AI 结构化,不涉及数据库操作""" - - # 1. 文件解析为纯文本(同步操作丢线程池) + """文件解析 + AI 并行结构化,不涉及数据库操作""" log.info(f"开始解析简历文件: {filename}") text = await asyncio.to_thread(parse_to_text, filename, content) if not text or not text.strip(): raise ValueError("文件内容为空,无法解析") log.info(f"文件解析完成,文本长度: {len(text)}") - # 2. AI 结构化解析 - log.info("开始AI结构化解析") - parsed = await self._ai_parse(text) - log.info("AI结构化解析完成") + log.info("开始AI并行结构化提取") + parsed = await extract_all(text) + log.info("AI并行结构化提取完成") return parsed - async def _ai_parse(self, text: str) -> dict: - """调用 AI 将纯文本解析为结构化 JSON""" - llm = LLM.DOUBAO_SEED_LITE.create(temperature=0) - messages = [SystemMessage(content=_SYSTEM_PROMPT), HumanMessage(content=text)] - response = await llm.ainvoke(messages) - - # 提取 JSON(兼容 markdown 代码块包裹) - raw = response.content.strip() - if raw.startswith("```"): - raw = raw.split("\n", 1)[1] if "\n" in raw else raw[3:] - raw = raw.rsplit("```", 1)[0] - return json.loads(raw) - async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int: """将解析结果写入主表 + 5张子表,返回简历ID""" resume_id = next_id() - # 主表 - resume = UserResume( + session.add(UserResume( id=resume_id, user_id=user_id, resume_name=filename.rsplit(".", 1)[0], target_position=None, is_default=0, sort_order=0, @@ -141,59 +50,48 @@ class ResumeParseService: wechat_number=parsed.get("wechatNumber"), portfolio_url=parsed.get("portfolioUrl"), skills=parsed.get("skills") or [], certificates=parsed.get("certificates") or [], summary=parsed.get("summary"), - ) - session.add(resume) + )) - # 教育经历 for i, edu in enumerate(parsed.get("education") or []): 
session.add(UserResumeEducation( id=next_id(), resume_id=resume_id, user_id=user_id, school=edu.get("school"), major=edu.get("major"), degree=edu.get("degree"), study_type=edu.get("studyType"), start_date=edu.get("startDate"), end_date=edu.get("endDate"), - description=_to_description_paragraphs(edu.get("description")), - sort_order=i, + description=_to_paragraphs(edu.get("description")), sort_order=i, )) - # 工作经历 for i, work in enumerate(parsed.get("work") or []): session.add(UserResumeWork( id=next_id(), resume_id=resume_id, user_id=user_id, company_name=work.get("companyName"), position=work.get("position"), start_date=work.get("startDate"), end_date=work.get("endDate"), - description=_to_description_paragraphs(work.get("description")), - sort_order=i, + description=_to_paragraphs(work.get("description")), sort_order=i, )) - # 实习经历 for i, intern in enumerate(parsed.get("internship") or []): session.add(UserResumeInternship( id=next_id(), resume_id=resume_id, user_id=user_id, company_name=intern.get("companyName"), position=intern.get("position"), start_date=intern.get("startDate"), end_date=intern.get("endDate"), - description=_to_description_paragraphs(intern.get("description")), - sort_order=i, + description=_to_paragraphs(intern.get("description")), sort_order=i, )) - # 项目经历 for i, proj in enumerate(parsed.get("project") or []): session.add(UserResumeProject( id=next_id(), resume_id=resume_id, user_id=user_id, company_name=proj.get("companyName"), project_name=proj.get("projectName"), role=proj.get("role"), start_date=proj.get("startDate"), end_date=proj.get("endDate"), - description=_to_description_paragraphs(proj.get("description")), - sort_order=i, + description=_to_paragraphs(proj.get("description")), sort_order=i, )) - # 竞赛经历 for i, comp in enumerate(parsed.get("competition") or []): session.add(UserResumeCompetition( id=next_id(), resume_id=resume_id, user_id=user_id, competition_name=comp.get("competitionName"), award=comp.get("award"), 
award_date=comp.get("awardDate"), - description=_to_description_paragraphs(comp.get("description")), - sort_order=i, + description=_to_paragraphs(comp.get("description")), sort_order=i, )) await session.flush() @@ -201,7 +99,7 @@ class ResumeParseService: return resume_id -def _to_description_paragraphs(texts: list[str] | None) -> list[dict] | None: +def _to_paragraphs(texts: list[str] | None) -> list[dict] | None: """将字符串数组转为 [{id, text}] 格式的描述段落""" if not texts: return None