"""简历解析 Service 上传简历文件 → 解析为纯文本 → AI 结构化 → 写入数据库。 依赖:file_parser(文件解析工具)、LLM(AI模型) 使用表:bg_user_resume(主表)、bg_user_resume_education/work/internship/project/competition(5张子表) """ import asyncio import json import shortuuid from langchain_core.messages import SystemMessage, HumanMessage from sqlalchemy.ext.asyncio import AsyncSession from app.ai.models import LLM from app.core.logger import log from app.models.user_resume import UserResume from app.models.user_resume_competition import UserResumeCompetition from app.models.user_resume_education import UserResumeEducation from app.models.user_resume_internship import UserResumeInternship from app.models.user_resume_project import UserResumeProject from app.models.user_resume_work import UserResumeWork from app.tool.file_parser import parse_to_text from app.tool.snowflake import next_id _SYSTEM_PROMPT = """你是一个专业的简历解析助手。请将用户提供的简历纯文本解析为结构化JSON。 输出格式要求(严格按此JSON结构输出,不要输出任何其他内容): ```json { "name": "姓名", "email": "邮箱", "mobileNumber": "手机号", "city": "所在城市", "wechatNumber": "微信号(如有)", "portfolioUrl": "作品集链接(如有)", "skills": ["技能1", "技能2"], "certificates": ["证书1", "证书2"], "summary": "个人概述/自我评价", "education": [ { "school": "学校名称", "major": "专业", "degree": "学历(大专/本科/硕士/博士)", "studyType": "学习形式(全日制/非全日制)", "startDate": "2020.09", "endDate": "2024.06", "description": ["描述段落1", "描述段落2"] } ], "work": [ { "companyName": "公司名称", "position": "职位", "startDate": "2024.07", "endDate": "2025.03", "description": ["工作描述段落1", "工作描述段落2"] } ], "internship": [ { "companyName": "公司名称", "position": "实习职位", "startDate": "2023.06", "endDate": "2023.09", "description": ["实习描述段落1"] } ], "project": [ { "companyName": "所属公司(如有)", "projectName": "项目名称", "role": "担任角色", "startDate": "2023.03", "endDate": "2023.12", "description": ["项目描述段落1"] } ], "competition": [ { "competitionName": "竞赛名称", "award": "获奖情况", "awardDate": "2023.07", "description": ["竞赛描述段落1"] } ] } ``` 规则: 1. 时间格式统一为 YYYY.MM(如 2023.09),如果只有年份则写 YYYY.01 2. 没有的字段填 null,没有的数组填 [] 3. description 是字符串数组,每个元素是一个描述段落 4. 区分工作经历和实习经历:明确标注"实习"的归入 internship,其余归入 work 5. 只输出 JSON,不要输出任何解释文字""" class ResumeParseService: async def parse_and_extract(self, filename: str, content: bytes) -> dict: """文件解析 + AI 结构化,不涉及数据库操作""" # 1. 文件解析为纯文本(同步操作丢线程池) log.info(f"开始解析简历文件: {filename}") text = await asyncio.to_thread(parse_to_text, filename, content) if not text or not text.strip(): raise ValueError("文件内容为空,无法解析") log.info(f"文件解析完成,文本长度: {len(text)}") # 2. AI 结构化解析 log.info("开始AI结构化解析") parsed = await self._ai_parse(text) log.info("AI结构化解析完成") return parsed async def _ai_parse(self, text: str) -> dict: """调用 AI 将纯文本解析为结构化 JSON""" llm = LLM.DOUBAO_SEED_PRO.create(temperature=0) messages = [SystemMessage(content=_SYSTEM_PROMPT), HumanMessage(content=text)] response = await llm.ainvoke(messages) # 提取 JSON(兼容 markdown 代码块包裹) raw = response.content.strip() if raw.startswith("```"): raw = raw.split("\n", 1)[1] if "\n" in raw else raw[3:] raw = raw.rsplit("```", 1)[0] return json.loads(raw) async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int: """将解析结果写入主表 + 5张子表,返回简历ID""" resume_id = next_id() # 主表 resume = UserResume( id=resume_id, user_id=user_id, resume_name=filename.rsplit(".", 1)[0], target_position=None, is_default=0, sort_order=0, name=parsed.get("name"), email=parsed.get("email"), mobile_number=parsed.get("mobileNumber"), city=parsed.get("city"), wechat_number=parsed.get("wechatNumber"), portfolio_url=parsed.get("portfolioUrl"), skills=parsed.get("skills") or [], certificates=parsed.get("certificates") or [], summary=parsed.get("summary"), ) session.add(resume) # 教育经历 for i, edu in enumerate(parsed.get("education") or []): session.add(UserResumeEducation( id=next_id(), resume_id=resume_id, user_id=user_id, school=edu.get("school"), major=edu.get("major"), degree=edu.get("degree"), study_type=edu.get("studyType"), start_date=edu.get("startDate"), end_date=edu.get("endDate"), description=_to_description_paragraphs(edu.get("description")), sort_order=i, )) # 工作经历 for i, work in enumerate(parsed.get("work") or []): session.add(UserResumeWork( id=next_id(), resume_id=resume_id, user_id=user_id, company_name=work.get("companyName"), position=work.get("position"), start_date=work.get("startDate"), end_date=work.get("endDate"), description=_to_description_paragraphs(work.get("description")), sort_order=i, )) # 实习经历 for i, intern in enumerate(parsed.get("internship") or []): session.add(UserResumeInternship( id=next_id(), resume_id=resume_id, user_id=user_id, company_name=intern.get("companyName"), position=intern.get("position"), start_date=intern.get("startDate"), end_date=intern.get("endDate"), description=_to_description_paragraphs(intern.get("description")), sort_order=i, )) # 项目经历 for i, proj in enumerate(parsed.get("project") or []): session.add(UserResumeProject( id=next_id(), resume_id=resume_id, user_id=user_id, company_name=proj.get("companyName"), project_name=proj.get("projectName"), role=proj.get("role"), start_date=proj.get("startDate"), end_date=proj.get("endDate"), description=_to_description_paragraphs(proj.get("description")), sort_order=i, )) # 竞赛经历 for i, comp in enumerate(parsed.get("competition") or []): session.add(UserResumeCompetition( id=next_id(), resume_id=resume_id, user_id=user_id, competition_name=comp.get("competitionName"), award=comp.get("award"), award_date=comp.get("awardDate"), description=_to_description_paragraphs(comp.get("description")), sort_order=i, )) await session.flush() log.info(f"简历保存完成,resumeId: {resume_id}") return resume_id def _to_description_paragraphs(texts: list[str] | None) -> list[dict] | None: """将字符串数组转为 [{id, text}] 格式的描述段落""" if not texts: return None return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t]