Files
offerpai_python_ai/app/services/resume_parse_service.py
T
2026-04-29 15:02:05 +08:00

107 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""简历解析 Service
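Upload a resume file → parse it to plain text → AI two-stage parallel structuring → write to the database.
Depends on: file_parser (file parsing tool), resume_extractor (AI two-stage parallel extraction).
Tables used: bg_user_resume (main table), bg_user_resume_education/work/internship/project/competition (5 child tables).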
"""
import asyncio
import shortuuid
from sqlalchemy.ext.asyncio import AsyncSession
from app.ai.resume_extractor.extractor import extract_all
from app.core.logger import log
from app.models.user_resume import UserResume
from app.models.user_resume_competition import UserResumeCompetition
from app.models.user_resume_education import UserResumeEducation
from app.models.user_resume_internship import UserResumeInternship
from app.models.user_resume_project import UserResumeProject
from app.models.user_resume_work import UserResumeWork
from app.tool.file_parser import parse_to_text
from app.tool.snowflake import next_id
class ResumeParseService:
    """Parse an uploaded resume and persist the structured result.

    ``parse_and_extract`` turns the raw file into structured data without
    touching the database; ``save_resume`` adds that data to a session as one
    main row plus rows for the five child tables.
    """

    async def parse_and_extract(self, filename: str, content: bytes) -> dict:
        """Parse the file to text, then run the AI two-stage parallel extraction.

        No database work happens here.

        Args:
            filename: Name of the uploaded file, forwarded to ``parse_to_text``.
            content: Raw bytes of the uploaded file.

        Returns:
            Whatever ``extract_all`` produces for the parsed text.

        Raises:
            ValueError: If the parsed text is empty or whitespace only.
        """
        log.info(f"开始解析简历文件: {filename}")

        # parse_to_text is handed to a worker thread instead of being called
        # directly inside this coroutine.
        raw_text = await asyncio.to_thread(parse_to_text, filename, content)
        if not (raw_text and raw_text.strip()):
            raise ValueError("文件内容为空,无法解析")
        log.info(f"文件解析完成,文本长度: {len(raw_text)}")

        log.info("开始AI两阶段并行结构化提取")
        structured = await extract_all(raw_text)
        log.info("AI两阶段并行结构化提取完成")
        return structured

    async def save_resume(
        self,
        session: AsyncSession,
        user_id: int,
        filename: str,
        parsed: dict,
    ) -> int:
        """Add the parsed resume to the session: one main row plus child rows.

        Child rows keep the order in which they appear in ``parsed`` via
        ``sort_order``. The session is flushed at the end; this method does
        not issue a commit itself.

        Args:
            session: Session the new rows are added to.
            user_id: Owner of the resume, stored on every row.
            filename: Uploaded file name; the part before its last dot becomes
                the resume name.
            parsed: Structured data read with camelCase keys such as
                ``mobileNumber`` and ``companyName``.

        Returns:
            The newly generated resume ID.
        """
        resume_id = next_id()
        lookup = parsed.get

        # Main table row.
        main_row = UserResume(
            id=resume_id,
            user_id=user_id,
            resume_name=filename.rsplit(".", 1)[0],
            target_position=None,
            is_default=0,
            sort_order=0,
            name=lookup("name"),
            email=lookup("email"),
            mobile_number=lookup("mobileNumber"),
            city=lookup("city"),
            wechat_number=lookup("wechatNumber"),
            portfolio_url=lookup("portfolioUrl"),
            skills=lookup("skills") or [],
            certificates=lookup("certificates") or [],
            summary=lookup("summary"),
        )
        session.add(main_row)

        # Education history.
        for order, item in enumerate(lookup("education") or []):
            education_row = UserResumeEducation(
                id=next_id(),
                resume_id=resume_id,
                user_id=user_id,
                school=item.get("school"),
                major=item.get("major"),
                degree=item.get("degree"),
                study_type=item.get("studyType"),
                start_date=item.get("startDate"),
                end_date=item.get("endDate"),
                description=_to_paragraphs(item.get("description")),
                sort_order=order,
            )
            session.add(education_row)

        # Work experience.
        for order, item in enumerate(lookup("work") or []):
            work_row = UserResumeWork(
                id=next_id(),
                resume_id=resume_id,
                user_id=user_id,
                company_name=item.get("companyName"),
                position=item.get("position"),
                start_date=item.get("startDate"),
                end_date=item.get("endDate"),
                description=_to_paragraphs(item.get("description")),
                sort_order=order,
            )
            session.add(work_row)

        # Internships.
        for order, item in enumerate(lookup("internship") or []):
            internship_row = UserResumeInternship(
                id=next_id(),
                resume_id=resume_id,
                user_id=user_id,
                company_name=item.get("companyName"),
                position=item.get("position"),
                start_date=item.get("startDate"),
                end_date=item.get("endDate"),
                description=_to_paragraphs(item.get("description")),
                sort_order=order,
            )
            session.add(internship_row)

        # Projects.
        for order, item in enumerate(lookup("project") or []):
            project_row = UserResumeProject(
                id=next_id(),
                resume_id=resume_id,
                user_id=user_id,
                company_name=item.get("companyName"),
                project_name=item.get("projectName"),
                role=item.get("role"),
                start_date=item.get("startDate"),
                end_date=item.get("endDate"),
                description=_to_paragraphs(item.get("description")),
                sort_order=order,
            )
            session.add(project_row)

        # Competitions.
        for order, item in enumerate(lookup("competition") or []):
            competition_row = UserResumeCompetition(
                id=next_id(),
                resume_id=resume_id,
                user_id=user_id,
                competition_name=item.get("competitionName"),
                award=item.get("award"),
                award_date=item.get("awardDate"),
                description=_to_paragraphs(item.get("description")),
                sort_order=order,
            )
            session.add(competition_row)

        await session.flush()
        log.info(f"简历保存完成,resumeId: {resume_id}")
        return resume_id
def _to_paragraphs(texts: list[str] | None) -> list[dict] | None:
"""将字符串数组转为 [{id, text}] 格式的描述段落"""
if not texts:
return None
return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t]