修改简历上传为多路上传
This commit is contained in:
@@ -31,7 +31,10 @@ offerpie_python_ai/
|
||||
│ └─ responses.py # 统一响应模型 StandardResponse(code/msg/data/timestamp/uuid)
|
||||
│
|
||||
├─ ai/ # **AI 能力层**
|
||||
│ └─ models.py # LLM 模型枚举(LLM.DOUBAO_PRO_256K、DEEPSEEK_V3、GPT_4O 等),基于 LangChain ChatOpenAI
|
||||
│ ├─ models.py # LLM 模型枚举(LLM.DOUBAO_PRO_256K、DEEPSEEK_V3、GPT_4O 等),基于 LangChain ChatOpenAI
|
||||
│ └─ resume_extractor/ # 简历 AI 提取模块
|
||||
│ ├─ prompts.py # 5 个提取任务的 System Prompt(个人信息/教育/工作+实习/项目/竞赛)
|
||||
│ └─ extractor.py # AI 并行提取(extract_all 入口,asyncio.gather 5 路并行)
|
||||
│
|
||||
├─ api/ # **路由层**(REST API 接口)
|
||||
│ ├─ health.py # 健康检查接口 GET /health/
|
||||
@@ -62,7 +65,7 @@ offerpie_python_ai/
|
||||
|------|----------|-------------|
|
||||
| **config** | 统一配置管理,基于 Pydantic Settings,支持 .env 文件加载 | `Settings`(数据库、Redis、LLM供应商、JWT、CORS、日志等全部配置项) |
|
||||
| **core** | 核心基础设施:数据库连接、Redis连接、鉴权、日志、中间件、异常处理、统一响应 | `database.py`、`redis.py`、`auth.py`、`middleware.py`、`exceptions.py`、`logger.py`、`StandardResponse` |
|
||||
| **ai** | AI 模型管理,封装多供应商 LLM 实例创建,基于 LangChain ChatOpenAI | `LLM` 枚举(火山引擎:doubao/deepseek,心缘:gpt-4o/claude) |
|
||||
| **ai** | AI 模型管理 + 业务 AI 能力 | `LLM` 枚举、`resume_extractor/`(简历并行提取:5路 AI 同时提取个人信息/教育/工作+实习/项目/竞赛) |
|
||||
| **api** | REST API 路由定义 | `health.py`(健康检查)、`resume.py`(简历上传解析) |
|
||||
| **models** | SQLAlchemy ORM 模型,与 Java 端共享同一数据库 | `FuncPermission`、`UserFuncPermissionStock`、`UserFuncUsageLog`、`UserResume`、`UserResumeEducation`/`Work`/`Internship`/`Project`/`Competition` |
|
||||
| **tool** | 无状态通用工具,不依赖数据库/Redis/用户上下文 | `file_parser.py`(PDF/Word/TXT 文件解析为纯文本)、`snowflake.py`(雪花ID生成) |
|
||||
|
||||
+3
-3
@@ -22,9 +22,8 @@ class LLM(Enum):
|
||||
"""所有可用模型,每个枚举值 = (模型名, api_key函数, base_url函数)"""
|
||||
|
||||
# 火山引擎
|
||||
DOUBAO_PRO_256K = ("doubao-pro-256k", *_VOLCENGINE)
|
||||
DOUBAO_PRO_32K = ("doubao-pro-32k", *_VOLCENGINE)
|
||||
DOUBAO_LITE_128K = ("doubao-lite-128k", *_VOLCENGINE)
|
||||
|
||||
DOUBAO_PRO_32K = ("doubao-1-5-pro-32k-250115", *_VOLCENGINE)
|
||||
DEEPSEEK_V3 = ("deepseek-v3-250324", *_VOLCENGINE)
|
||||
DEEPSEEK_R1 = ("deepseek-r1-250528", *_VOLCENGINE)
|
||||
DOUBAO_SEED_LITE = ("doubao-seed-2-0-lite-260215", *_VOLCENGINE)
|
||||
@@ -34,6 +33,7 @@ class LLM(Enum):
|
||||
GPT_4O = ("gpt-4o", *_CARDIAC)
|
||||
GPT_4O_MINI = ("gpt-4o-mini", *_CARDIAC)
|
||||
CLAUDE_SONNET_4 = ("claude-sonnet-4-20250514", *_CARDIAC)
|
||||
GEMINI_FLASH = ("gemini-2.5-flash", *_CARDIAC)
|
||||
|
||||
def __init__(self, model_name: str, api_key_fn, base_url_fn):
|
||||
self.model_name = model_name
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
"""简历并行提取:将完整简历文本拆分为5个AI任务并行提取"""
|
||||
|
||||
import asyncio
|
||||
|
||||
from langchain_core.output_parsers import JsonOutputParser
|
||||
from langchain_core.prompts import ChatPromptTemplate
|
||||
|
||||
from app.ai.models import LLM
|
||||
from app.ai.resume_extractor.prompts import (
|
||||
PROFILE_PROMPT, EDUCATION_PROMPT, WORK_PROMPT,
|
||||
PROJECT_PROMPT, COMPETITION_PROMPT,
|
||||
)
|
||||
from app.core.logger import log
|
||||
|
||||
|
||||
def _build_chain(prompt: str):
    """Assemble one extraction pipeline: prompt template -> LLM -> JSON parser.

    ``prompt`` becomes the system message of the template; the human message
    is the ``{text}`` template variable.
    """
    template = ChatPromptTemplate.from_messages(
        [("system", prompt), ("human", "{text}")]
    )
    model = LLM.DOUBAO_PRO_32K.create(temperature=0)
    parser = JsonOutputParser()
    return template | model | parser
|
||||
|
||||
|
||||
# Five independent extraction chains, built in the same order as the prompts.
(
    _profile_chain,
    _education_chain,
    _work_chain,
    _project_chain,
    _competition_chain,
) = (
    _build_chain(section_prompt)
    for section_prompt in (
        PROFILE_PROMPT,
        EDUCATION_PROMPT,
        WORK_PROMPT,
        PROJECT_PROMPT,
        COMPETITION_PROMPT,
    )
)
|
||||
|
||||
|
||||
async def extract_all(text: str) -> dict:
    """Run all five extraction chains concurrently and merge their outputs.

    Each chain is awaited through ``_safe_invoke`` under ``asyncio.gather``.
    The profile result, when it is a dict, is the base of the returned
    mapping. The keys ``education``, ``work``, ``internship``, ``project``
    and ``competition`` are then set on it, each falling back to ``[]`` when
    the corresponding chain result does not have the expected type.
    """
    log.info("开始5路并行AI提取")
    payload = {"text": text}

    jobs = [
        _safe_invoke(_profile_chain, payload, "个人信息"),
        _safe_invoke(_education_chain, payload, "教育经历"),
        _safe_invoke(_work_chain, payload, "工作+实习经历"),
        _safe_invoke(_project_chain, payload, "项目经历"),
        _safe_invoke(_competition_chain, payload, "竞赛经历"),
    ]
    profile, education, work_intern, project, competition = await asyncio.gather(*jobs)

    def _list_or_empty(value):
        # Anything that is not a list is replaced by an empty list.
        return value if isinstance(value, list) else []

    merged = profile if isinstance(profile, dict) else {}
    merged["education"] = _list_or_empty(education)

    # The work chain returns one object holding both "work" and "internship".
    if isinstance(work_intern, dict):
        merged["work"] = work_intern.get("work", [])
        merged["internship"] = work_intern.get("internship", [])
    else:
        merged["work"] = []
        merged["internship"] = []

    merged["project"] = _list_or_empty(project)
    merged["competition"] = _list_or_empty(competition)
    return merged
|
||||
|
||||
|
||||
async def _safe_invoke(chain, inp: dict, label: str):
    """Await ``chain.ainvoke(inp)``, returning an empty value if it raises.

    On any ``Exception`` a warning is logged and the fallback is returned:
    ``{}`` when ``label`` contains "个人信息", ``[]`` otherwise.
    """
    try:
        outcome = await chain.ainvoke(inp)
    except Exception as e:
        log.warning(f"AI提取[{label}]失败: {e}")
        if "个人信息" in label:
            return {}
        return []
    return outcome
|
||||
@@ -0,0 +1,79 @@
|
||||
"""简历各模块提取的 System Prompt
|
||||
|
||||
注意:prompt 中的 JSON 示例花括号必须用 {{ }} 转义,避免被 ChatPromptTemplate 当作变量。
|
||||
"""
|
||||
|
||||
PROFILE_PROMPT = """从简历文本中仅提取个人基本信息,原文提取不要改写,输出JSON:
|
||||
```json
|
||||
{{
|
||||
"name": "姓名",
|
||||
"email": "邮箱",
|
||||
"mobileNumber": "手机号",
|
||||
"city": "所在城市",
|
||||
"wechatNumber": "微信号",
|
||||
"portfolioUrl": "作品集链接",
|
||||
"skills": ["技能1"],
|
||||
"certificates": ["证书1"],
|
||||
"summary": "个人概述原文"
|
||||
}}
|
||||
```
|
||||
规则:只提取个人信息,不提取经历内容。summary只填"自我评价/个人概述"原文。没有的填null,数组填[]。只输出JSON。"""
|
||||
|
||||
EDUCATION_PROMPT = """从简历文本中仅提取教育经历,原文提取不要改写,输出JSON数组:
|
||||
```json
|
||||
[{{
|
||||
"school": "学校",
|
||||
"major": "专业",
|
||||
"degree": "学历",
|
||||
"studyType": "全日制/非全日制",
|
||||
"startDate": "2020.09",
|
||||
"endDate": "2024.06",
|
||||
"description": ["原文段落"]
|
||||
}}]
|
||||
```
|
||||
规则:只提取教育经历,不提取工作/实习/项目/竞赛。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
|
||||
|
||||
WORK_PROMPT = """从简历文本中仅提取工作经历和实习经历,原文提取不要改写,输出JSON:
|
||||
```json
|
||||
{{
|
||||
"work": [{{
|
||||
"companyName": "公司",
|
||||
"position": "职位",
|
||||
"startDate": "2024.07",
|
||||
"endDate": "2025.03",
|
||||
"description": ["原文段落"]
|
||||
}}],
|
||||
"internship": [{{
|
||||
"companyName": "公司",
|
||||
"position": "职位",
|
||||
"startDate": "2023.06",
|
||||
"endDate": "2023.09",
|
||||
"description": ["原文段落"]
|
||||
}}]
|
||||
}}
|
||||
```
|
||||
规则:标注"实习"的归internship,其余归work。不提取项目/教育/竞赛。时间格式YYYY.MM。没有填[]。只输出JSON。"""
|
||||
|
||||
PROJECT_PROMPT = """从简历文本中仅提取项目经历,原文提取不要改写,输出JSON数组:
|
||||
```json
|
||||
[{{
|
||||
"companyName": "所属公司",
|
||||
"projectName": "项目名",
|
||||
"role": "角色名称(如:后端开发、项目经理、前端工程师,只填角色名不填职责描述)",
|
||||
"startDate": "2023.03",
|
||||
"endDate": "2023.12",
|
||||
"description": ["原文段落"]
|
||||
}}]
|
||||
```
|
||||
规则:只提取项目经历,不提取工作/实习/教育/竞赛。role只填简短角色名,职责内容放description。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
|
||||
|
||||
COMPETITION_PROMPT = """从简历文本中仅提取竞赛/获奖经历,原文提取不要改写,输出JSON数组:
|
||||
```json
|
||||
[{{
|
||||
"competitionName": "竞赛名",
|
||||
"award": "获奖情况",
|
||||
"awardDate": "2023.07",
|
||||
"description": ["原文段落"]
|
||||
}}]
|
||||
```
|
||||
规则:只提取竞赛获奖,不提取其他经历。时间格式YYYY.MM。没有输出[]。只输出JSON。"""
|
||||
@@ -1,18 +1,16 @@
|
||||
"""简历解析 Service
|
||||
|
||||
上传简历文件 → 解析为纯文本 → AI 结构化 → 写入数据库。
|
||||
依赖:file_parser(文件解析工具)、LLM(AI模型)
|
||||
上传简历文件 → 解析为纯文本 → AI 并行结构化 → 写入数据库。
|
||||
依赖:file_parser(文件解析工具)、resume_extractor(AI并行提取)
|
||||
使用表:bg_user_resume(主表)、bg_user_resume_education/work/internship/project/competition(5张子表)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import shortuuid
|
||||
from langchain_core.messages import SystemMessage, HumanMessage
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.ai.models import LLM
|
||||
from app.ai.resume_extractor.extractor import extract_all
|
||||
from app.core.logger import log
|
||||
from app.models.user_resume import UserResume
|
||||
from app.models.user_resume_competition import UserResumeCompetition
|
||||
@@ -24,115 +22,26 @@ from app.tool.file_parser import parse_to_text
|
||||
from app.tool.snowflake import next_id
|
||||
|
||||
|
||||
_SYSTEM_PROMPT = """你是一个专业的简历解析助手。请将用户提供的简历纯文本解析为结构化JSON。
|
||||
|
||||
输出格式要求(严格按此JSON结构输出,不要输出任何其他内容):
|
||||
```json
|
||||
{
|
||||
"name": "姓名",
|
||||
"email": "邮箱",
|
||||
"mobileNumber": "手机号",
|
||||
"city": "所在城市",
|
||||
"wechatNumber": "微信号(如有)",
|
||||
"portfolioUrl": "作品集链接(如有)",
|
||||
"skills": ["技能1", "技能2"],
|
||||
"certificates": ["证书1", "证书2"],
|
||||
"summary": "个人概述/自我评价",
|
||||
"education": [
|
||||
{
|
||||
"school": "学校名称",
|
||||
"major": "专业",
|
||||
"degree": "学历(大专/本科/硕士/博士)",
|
||||
"studyType": "学习形式(全日制/非全日制)",
|
||||
"startDate": "2020.09",
|
||||
"endDate": "2024.06",
|
||||
"description": ["描述段落1", "描述段落2"]
|
||||
}
|
||||
],
|
||||
"work": [
|
||||
{
|
||||
"companyName": "公司名称",
|
||||
"position": "职位",
|
||||
"startDate": "2024.07",
|
||||
"endDate": "2025.03",
|
||||
"description": ["工作描述段落1", "工作描述段落2"]
|
||||
}
|
||||
],
|
||||
"internship": [
|
||||
{
|
||||
"companyName": "公司名称",
|
||||
"position": "实习职位",
|
||||
"startDate": "2023.06",
|
||||
"endDate": "2023.09",
|
||||
"description": ["实习描述段落1"]
|
||||
}
|
||||
],
|
||||
"project": [
|
||||
{
|
||||
"companyName": "所属公司(如有)",
|
||||
"projectName": "项目名称",
|
||||
"role": "担任角色",
|
||||
"startDate": "2023.03",
|
||||
"endDate": "2023.12",
|
||||
"description": ["项目描述段落1"]
|
||||
}
|
||||
],
|
||||
"competition": [
|
||||
{
|
||||
"competitionName": "竞赛名称",
|
||||
"award": "获奖情况",
|
||||
"awardDate": "2023.07",
|
||||
"description": ["竞赛描述段落1"]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
规则:
|
||||
1. 时间格式统一为 YYYY.MM(如 2023.09),如果只有年份则写 YYYY.01
|
||||
2. 没有的字段填 null,没有的数组填 []
|
||||
3. description 是字符串数组,每个元素是一个描述段落
|
||||
4. 区分工作经历和实习经历:明确标注"实习"的归入 internship,其余归入 work
|
||||
5. 只输出 JSON,不要输出任何解释文字"""
|
||||
|
||||
|
||||
class ResumeParseService:
|
||||
|
||||
async def parse_and_extract(self, filename: str, content: bytes) -> dict:
|
||||
"""文件解析 + AI 结构化,不涉及数据库操作"""
|
||||
|
||||
# 1. 文件解析为纯文本(同步操作丢线程池)
|
||||
"""文件解析 + AI 并行结构化,不涉及数据库操作"""
|
||||
log.info(f"开始解析简历文件: {filename}")
|
||||
text = await asyncio.to_thread(parse_to_text, filename, content)
|
||||
if not text or not text.strip():
|
||||
raise ValueError("文件内容为空,无法解析")
|
||||
log.info(f"文件解析完成,文本长度: {len(text)}")
|
||||
|
||||
# 2. AI 结构化解析
|
||||
log.info("开始AI结构化解析")
|
||||
parsed = await self._ai_parse(text)
|
||||
log.info("AI结构化解析完成")
|
||||
log.info("开始AI并行结构化提取")
|
||||
parsed = await extract_all(text)
|
||||
log.info("AI并行结构化提取完成")
|
||||
return parsed
|
||||
|
||||
async def _ai_parse(self, text: str) -> dict:
    """Ask the LLM to turn plain resume text into a structured dict.

    The reply is parsed with ``json.loads`` after removing a surrounding
    Markdown code fence, if the reply starts with one.
    """
    model = LLM.DOUBAO_SEED_LITE.create(temperature=0)
    reply = await model.ainvoke(
        [SystemMessage(content=_SYSTEM_PROMPT), HumanMessage(content=text)]
    )

    payload = reply.content.strip()
    if payload.startswith("```"):
        # Drop the opening fence line, or just the three backticks when the
        # reply has no newline at all.
        if "\n" in payload:
            payload = payload.split("\n", 1)[1]
        else:
            payload = payload[3:]
        # Keep only what precedes the last closing fence.
        payload = payload.rsplit("```", 1)[0]
    return json.loads(payload)
|
||||
|
||||
async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int:
|
||||
"""将解析结果写入主表 + 5张子表,返回简历ID"""
|
||||
resume_id = next_id()
|
||||
|
||||
# 主表
|
||||
resume = UserResume(
|
||||
session.add(UserResume(
|
||||
id=resume_id, user_id=user_id,
|
||||
resume_name=filename.rsplit(".", 1)[0],
|
||||
target_position=None, is_default=0, sort_order=0,
|
||||
@@ -141,59 +50,48 @@ class ResumeParseService:
|
||||
wechat_number=parsed.get("wechatNumber"), portfolio_url=parsed.get("portfolioUrl"),
|
||||
skills=parsed.get("skills") or [], certificates=parsed.get("certificates") or [],
|
||||
summary=parsed.get("summary"),
|
||||
)
|
||||
session.add(resume)
|
||||
))
|
||||
|
||||
# 教育经历
|
||||
for i, edu in enumerate(parsed.get("education") or []):
|
||||
session.add(UserResumeEducation(
|
||||
id=next_id(), resume_id=resume_id, user_id=user_id,
|
||||
school=edu.get("school"), major=edu.get("major"),
|
||||
degree=edu.get("degree"), study_type=edu.get("studyType"),
|
||||
start_date=edu.get("startDate"), end_date=edu.get("endDate"),
|
||||
description=_to_description_paragraphs(edu.get("description")),
|
||||
sort_order=i,
|
||||
description=_to_paragraphs(edu.get("description")), sort_order=i,
|
||||
))
|
||||
|
||||
# 工作经历
|
||||
for i, work in enumerate(parsed.get("work") or []):
|
||||
session.add(UserResumeWork(
|
||||
id=next_id(), resume_id=resume_id, user_id=user_id,
|
||||
company_name=work.get("companyName"), position=work.get("position"),
|
||||
start_date=work.get("startDate"), end_date=work.get("endDate"),
|
||||
description=_to_description_paragraphs(work.get("description")),
|
||||
sort_order=i,
|
||||
description=_to_paragraphs(work.get("description")), sort_order=i,
|
||||
))
|
||||
|
||||
# 实习经历
|
||||
for i, intern in enumerate(parsed.get("internship") or []):
|
||||
session.add(UserResumeInternship(
|
||||
id=next_id(), resume_id=resume_id, user_id=user_id,
|
||||
company_name=intern.get("companyName"), position=intern.get("position"),
|
||||
start_date=intern.get("startDate"), end_date=intern.get("endDate"),
|
||||
description=_to_description_paragraphs(intern.get("description")),
|
||||
sort_order=i,
|
||||
description=_to_paragraphs(intern.get("description")), sort_order=i,
|
||||
))
|
||||
|
||||
# 项目经历
|
||||
for i, proj in enumerate(parsed.get("project") or []):
|
||||
session.add(UserResumeProject(
|
||||
id=next_id(), resume_id=resume_id, user_id=user_id,
|
||||
company_name=proj.get("companyName"), project_name=proj.get("projectName"),
|
||||
role=proj.get("role"),
|
||||
start_date=proj.get("startDate"), end_date=proj.get("endDate"),
|
||||
description=_to_description_paragraphs(proj.get("description")),
|
||||
sort_order=i,
|
||||
description=_to_paragraphs(proj.get("description")), sort_order=i,
|
||||
))
|
||||
|
||||
# 竞赛经历
|
||||
for i, comp in enumerate(parsed.get("competition") or []):
|
||||
session.add(UserResumeCompetition(
|
||||
id=next_id(), resume_id=resume_id, user_id=user_id,
|
||||
competition_name=comp.get("competitionName"), award=comp.get("award"),
|
||||
award_date=comp.get("awardDate"),
|
||||
description=_to_description_paragraphs(comp.get("description")),
|
||||
sort_order=i,
|
||||
description=_to_paragraphs(comp.get("description")), sort_order=i,
|
||||
))
|
||||
|
||||
await session.flush()
|
||||
@@ -201,7 +99,7 @@ class ResumeParseService:
|
||||
return resume_id
|
||||
|
||||
|
||||
def _to_description_paragraphs(texts: list[str] | None) -> list[dict] | None:
|
||||
def _to_paragraphs(texts: list[str] | None) -> list[dict] | None:
|
||||
"""将字符串数组转为 [{id, text}] 格式的描述段落"""
|
||||
if not texts:
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user