优化简历提取速度

2026-04-29 15:02:05 +08:00
parent 0a15e320f5
commit b26c0a7262
5 changed files with 247 additions and 127 deletions
@@ -1,7 +1,7 @@
 """简历解析 Service

-上传简历文件 → 解析为纯文本 → AI 并行结构化 → 写入数据库。
-依赖：file_parser（文件解析工具）、resume_extractor（AI并行提取）
+上传简历文件 → 解析为纯文本 → AI 两阶段并行结构化 → 写入数据库。
+依赖：file_parser（文件解析工具）、resume_extractor（AI两阶段并行提取）
 使用表：bg_user_resume（主表）、bg_user_resume_education/work/internship/project/competition（5张子表）
 """

@@ -25,16 +25,16 @@ from app.tool.snowflake import next_id
 class ResumeParseService:

    async def parse_and_extract(self, filename: str, content: bytes) -> dict:
-        """文件解析 + AI 并行结构化，不涉及数据库操作"""
+        """文件解析 + AI 两阶段并行结构化，不涉及数据库操作"""
        log.info(f"开始解析简历文件: {filename}")
        text = await asyncio.to_thread(parse_to_text, filename, content)
        if not text or not text.strip():
            raise ValueError("文件内容为空，无法解析")
        log.info(f"文件解析完成，文本长度: {len(text)}")

-        log.info("开始AI并行结构化提取")
+        log.info("开始AI两阶段并行结构化提取")
        parsed = await extract_all(text)
-        log.info("AI并行结构化提取完成")
+        log.info("AI两阶段并行结构化提取完成")
        return parsed

    async def save_resume(self, session: AsyncSession, user_id: int, filename: str, parsed: dict) -> int:
@@ -103,4 +103,4 @@ def _to_paragraphs(texts: list[str] | None) -> list[dict] | None:
    """将字符串数组转为 [{id, text}] 格式的描述段落"""
    if not texts:
        return None
-    return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t]
+    return [{"id": shortuuid.ShortUUID().random(length=8), "text": t} for t in texts if t]