Files
offerpai_python_ai/app/services/resume_parse_service.py
T
2026-04-02 16:01:08 +08:00

208 lines
8.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""简历解析 Service
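Upload a resume file → parse it to plain text → structure it with AI → write it to the database.
Depends on: file_parser (file-parsing utility), LLM (AI model)
Tables used: bg_user_resume (main table) and the five child tables bg_user_resume_education/work/internship/project/competition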
"""
import asyncio
import json
import shortuuid
from langchain_core.messages import SystemMessage, HumanMessage
from sqlalchemy.ext.asyncio import AsyncSession
from app.ai.models import LLM
from app.core.logger import log
from app.models.user_resume import UserResume
from app.models.user_resume_competition import UserResumeCompetition
from app.models.user_resume_education import UserResumeEducation
from app.models.user_resume_internship import UserResumeInternship
from app.models.user_resume_project import UserResumeProject
from app.models.user_resume_work import UserResumeWork
from app.tool.file_parser import parse_to_text
from app.tool.snowflake import next_id
# System prompt sent as the SystemMessage in ResumeParseService._ai_parse.
# It pins the JSON schema the model must return; the camelCase keys listed here
# (name, mobileNumber, education[].studyType, ...) are exactly the keys that
# ResumeParseService.save_resume reads back with parsed.get(...), so a key
# renamed in this prompt must be renamed there as well.
_SYSTEM_PROMPT = """你是一个专业的简历解析助手。请将用户提供的简历纯文本解析为结构化JSON。
输出格式要求(严格按此JSON结构输出,不要输出任何其他内容):
```json
{
"name": "姓名",
"email": "邮箱",
"mobileNumber": "手机号",
"city": "所在城市",
"wechatNumber": "微信号(如有)",
"portfolioUrl": "作品集链接(如有)",
"skills": ["技能1", "技能2"],
"certificates": ["证书1", "证书2"],
"summary": "个人概述/自我评价",
"education": [
{
"school": "学校名称",
"major": "专业",
"degree": "学历(大专/本科/硕士/博士)",
"studyType": "学习形式(全日制/非全日制)",
"startDate": "2020.09",
"endDate": "2024.06",
"description": ["描述段落1", "描述段落2"]
}
],
"work": [
{
"companyName": "公司名称",
"position": "职位",
"startDate": "2024.07",
"endDate": "2025.03",
"description": ["工作描述段落1", "工作描述段落2"]
}
],
"internship": [
{
"companyName": "公司名称",
"position": "实习职位",
"startDate": "2023.06",
"endDate": "2023.09",
"description": ["实习描述段落1"]
}
],
"project": [
{
"companyName": "所属公司(如有)",
"projectName": "项目名称",
"role": "担任角色",
"startDate": "2023.03",
"endDate": "2023.12",
"description": ["项目描述段落1"]
}
],
"competition": [
{
"competitionName": "竞赛名称",
"award": "获奖情况",
"awardDate": "2023.07",
"description": ["竞赛描述段落1"]
}
]
}
```
规则:
1. 时间格式统一为 YYYY.MM(如 2023.09),如果只有年份则写 YYYY.01
2. 没有的字段填 null,没有的数组填 []
3. description 是字符串数组,每个元素是一个描述段落
4. 区分工作经历和实习经历:明确标注"实习"的归入 internship,其余归入 work
5. 只输出 JSON,不要输出任何解释文字"""
class ResumeParseService:
    """Turn an uploaded resume file into structured data and persist it.

    Pipeline: file bytes -> plain text (``parse_to_text``) -> structured dict
    (LLM) -> one ``UserResume`` row plus child rows in the five section tables.
    """

    async def parse_and_extract(self, filename: str, content: bytes) -> dict:
        """Parse the file to text and let the LLM structure it; no database access.

        Args:
            filename: Original file name, forwarded to ``parse_to_text``.
            content: Raw bytes of the uploaded file.

        Returns:
            The structured resume dict produced by ``_ai_parse``.

        Raises:
            ValueError: If the file yields no text, or the model reply is not
                a JSON object (``json.JSONDecodeError`` is a ``ValueError``
                subclass and is raised when the reply is not valid JSON).
        """
        # 1. File -> plain text. parse_to_text is synchronous, so it is run in
        #    a worker thread to keep the event loop free.
        log.info(f"开始解析简历文件: {filename}")
        text = await asyncio.to_thread(parse_to_text, filename, content)
        if not text or not text.strip():
            raise ValueError("文件内容为空,无法解析")
        log.info(f"文件解析完成,文本长度: {len(text)}")

        # 2. Plain text -> structured dict via the LLM.
        log.info("开始AI结构化解析")
        parsed = await self._ai_parse(text)
        log.info("AI结构化解析完成")
        return parsed

    async def _ai_parse(self, text: str) -> dict:
        """Ask the LLM to convert resume plain text into the structured dict.

        Raises:
            json.JSONDecodeError: If no valid JSON can be found in the reply.
            ValueError: If the reply is valid JSON but not an object.
        """
        llm = LLM.DOUBAO_SEED_PRO.create(temperature=0)
        messages = [SystemMessage(content=_SYSTEM_PROMPT), HumanMessage(content=text)]
        response = await llm.ainvoke(messages)
        return self._extract_json(response.content)

    @staticmethod
    def _extract_json(raw) -> dict:
        """Pull the JSON object out of a model reply.

        Accepts a plain string or a list of content parts (strings or dicts
        carrying a ``"text"`` entry). A leading markdown code fence is
        stripped first; if the remainder still is not valid JSON, the text
        between the first ``{`` and the last ``}`` is tried, which tolerates
        prose placed before or after the JSON.

        Raises:
            json.JSONDecodeError: If no valid JSON can be recovered.
            ValueError: If the decoded JSON is not an object.
        """
        if isinstance(raw, list):
            # Multi-part content: concatenate the textual parts only.
            raw = "".join(
                part if isinstance(part, str) else str(part.get("text") or "")
                for part in raw
                if isinstance(part, (str, dict))
            )
        candidate = str(raw or "").strip()

        if candidate.startswith("```"):
            # Drop the opening fence line (e.g. "```json") and the closing fence.
            _, newline, rest = candidate.partition("\n")
            candidate = rest if newline else candidate[3:]
            candidate = candidate.rsplit("```", 1)[0]

        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            start, end = candidate.find("{"), candidate.rfind("}")
            if start == -1 or end <= start:
                raise
            data = json.loads(candidate[start:end + 1])

        if not isinstance(data, dict):
            # save_resume reads the result with .get(); fail here with a clear
            # message instead of an AttributeError later on.
            raise ValueError("AI 返回的内容不是 JSON 对象,无法解析")
        return data

    @staticmethod
    def _section_items(parsed: dict, key: str) -> list[dict]:
        """Return the dict entries of one list-valued section of ``parsed``.

        A missing/None section yields ``[]``; a section given as a single
        object is treated as a one-element list; entries that are not dicts
        (e.g. ``null`` or bare strings) are skipped because they carry no
        readable fields.
        """
        value = parsed.get(key)
        if isinstance(value, dict):
            value = [value]
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int:
        """Write the parsed resume to the main table and the five child tables.

        The rows are added to ``session`` and flushed; no commit is issued here.

        Args:
            session: Async SQLAlchemy session the rows are added to.
            user_id: Owner of the resume, stored on every row.
            filename: Uploaded file name; its stem becomes ``resume_name``.
            parsed: Structured resume dict using the keys defined in the
                system prompt.

        Returns:
            The id generated for the new ``UserResume`` row.
        """
        resume_id = next_id()

        # Main table
        session.add(UserResume(
            id=resume_id, user_id=user_id,
            # Strip the extension; fall back to the full name when nothing
            # would be left (e.g. ".pdf").
            resume_name=filename.rsplit(".", 1)[0] or filename,
            target_position=None, is_default=0, sort_order=0,
            name=parsed.get("name"), email=parsed.get("email"),
            mobile_number=parsed.get("mobileNumber"), city=parsed.get("city"),
            wechat_number=parsed.get("wechatNumber"), portfolio_url=parsed.get("portfolioUrl"),
            skills=parsed.get("skills") or [], certificates=parsed.get("certificates") or [],
            summary=parsed.get("summary"),
        ))

        # Education
        for i, edu in enumerate(self._section_items(parsed, "education")):
            session.add(UserResumeEducation(
                id=next_id(), resume_id=resume_id, user_id=user_id,
                school=edu.get("school"), major=edu.get("major"),
                degree=edu.get("degree"), study_type=edu.get("studyType"),
                start_date=edu.get("startDate"), end_date=edu.get("endDate"),
                description=_to_description_paragraphs(edu.get("description")),
                sort_order=i,
            ))

        # Work experience
        for i, work in enumerate(self._section_items(parsed, "work")):
            session.add(UserResumeWork(
                id=next_id(), resume_id=resume_id, user_id=user_id,
                company_name=work.get("companyName"), position=work.get("position"),
                start_date=work.get("startDate"), end_date=work.get("endDate"),
                description=_to_description_paragraphs(work.get("description")),
                sort_order=i,
            ))

        # Internships
        for i, intern in enumerate(self._section_items(parsed, "internship")):
            session.add(UserResumeInternship(
                id=next_id(), resume_id=resume_id, user_id=user_id,
                company_name=intern.get("companyName"), position=intern.get("position"),
                start_date=intern.get("startDate"), end_date=intern.get("endDate"),
                description=_to_description_paragraphs(intern.get("description")),
                sort_order=i,
            ))

        # Projects
        for i, proj in enumerate(self._section_items(parsed, "project")):
            session.add(UserResumeProject(
                id=next_id(), resume_id=resume_id, user_id=user_id,
                company_name=proj.get("companyName"), project_name=proj.get("projectName"),
                role=proj.get("role"),
                start_date=proj.get("startDate"), end_date=proj.get("endDate"),
                description=_to_description_paragraphs(proj.get("description")),
                sort_order=i,
            ))

        # Competitions
        for i, comp in enumerate(self._section_items(parsed, "competition")):
            session.add(UserResumeCompetition(
                id=next_id(), resume_id=resume_id, user_id=user_id,
                competition_name=comp.get("competitionName"), award=comp.get("award"),
                award_date=comp.get("awardDate"),
                description=_to_description_paragraphs(comp.get("description")),
                sort_order=i,
            ))

        await session.flush()
        log.info(f"简历保存完成,resumeId: {resume_id}")
        return resume_id
def _to_description_paragraphs(texts: list[str] | None) -> list[dict] | None:
"""将字符串数组转为 [{id, text}] 格式的描述段落"""
if not texts:
return None
return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t]