优化简历提取速度

2026-04-29 15:02:05 +08:00
parent 0a15e320f5
commit b26c0a7262
5 changed files with 247 additions and 127 deletions
@@ -1,24 +1,25 @@
 """文件解析工具

 将上传的简历文件（PDF / Word / TXT）转换为纯文本字符串。
+PDF 使用 PyMuPDF (fitz) 按文本块提取，保持段落边界和阅读顺序。
 """

 import io

-import pdfplumber
+import fitz
 from docx import Document

 from app.core.logger import log


def parse_pdf(content: bytes) -> str:
-    """解析 PDF 文件，提取全部页面文本"""
+    """Parse a PDF from bytes: collect non-empty text blocks (skipping image blocks) from sorted page blocks, joined by newlines."""
    text_parts: list[str] = []
-    with pdfplumber.open(io.BytesIO(content)) as pdf:
-        for page in pdf.pages:
-            page_text = page.extract_text()
-            if page_text:
-                text_parts.append(page_text)
+    with fitz.open(stream=content, filetype="pdf") as doc:
+        for page in doc:
+            for b in page.get_text("blocks", sort=True):
+                if b[6] == 0 and b[4].strip():  # b[6] is the block type (0 = text block, 1 = image block); b[4] is the block's text
+                    text_parts.append(b[4].strip())
    return "\n".join(text_parts)


@@ -28,22 +29,16 @@ def parse_docx(content: bytes) -> str:
        doc = Document(io.BytesIO(content))
    except Exception:
        raise ValueError("无法解析该 Word 文件，如果是旧版 .doc 格式，请另存为 .docx 后重试")
-
    text_parts: list[str] = []
-
-    # 段落
    for para in doc.paragraphs:
        text = para.text.strip()
        if text:
            text_parts.append(text)
-
-    # 表格
    for table in doc.tables:
        for row in table.rows:
            row_text = "\t".join(cell.text.strip() for cell in row.cells)
            if row_text.strip():
                text_parts.append(row_text)
-
    return "\n".join(text_parts)


@@ -61,7 +56,6 @@ def parse_to_text(filename: str, content: bytes) -> str:
    """根据文件名后缀自动选择解析方法，返回纯文本"""
    suffix = filename[filename.rfind("."):].lower() if "." in filename else ""
    log.info(f"解析文件: {filename}，类型: {suffix}")
-
    if suffix == ".pdf":
        return parse_pdf(content)
    elif suffix in (".docx", ".doc"):