添加文件解析能力

2026-04-02 14:55:05 +08:00
parent 387c4e6f7e
commit 4de721ffca
4 changed files with 78 additions and 0 deletions
@@ -0,0 +1,68 @@
+"""文件解析工具
+
+将上传的简历文件（PDF / Word / TXT）转换为纯文本字符串。
+"""
+
+import io
+
+import pdfplumber
+from docx import Document
+
+from app.core.logger import log
+
+
def parse_pdf(content: bytes) -> str:
    """Extract the text of every page of a PDF document.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The text of all pages that yielded any text, in page order, joined
        by newlines. Pages for which no text is extracted are skipped.
    """
    with pdfplumber.open(io.BytesIO(content)) as pdf:
        # Extraction happens while the document is still open.
        extracted = [page.extract_text() for page in pdf.pages]
    # Drop pages whose extraction produced None or an empty string.
    return "\n".join(page_text for page_text in extracted if page_text)
+
+
def parse_docx(content: bytes) -> str:
    """Extract paragraph and table text from a Word (.docx) document.

    Body paragraphs are emitted first (one per line), followed by the rows of
    every table, with the cell texts of a row joined by tabs. Blank paragraphs
    and rows whose cells are all empty are skipped.

    Args:
        content: Raw bytes of the .docx file.

    Returns:
        The collected lines joined by newlines.
    """
    doc = Document(io.BytesIO(content))

    # Paragraphs: keep only those that contain non-whitespace text.
    text_parts: list[str] = [
        text for para in doc.paragraphs if (text := para.text.strip())
    ]

    # Tables: one output line per non-empty row.
    for table in doc.tables:
        for row in table.rows:
            cell_texts: list[str] = []
            previous_cell = None
            for cell in row.cells:
                # python-docx reports a horizontally merged cell once per grid
                # column it spans, handing back the same cell object each
                # time. Skip those repeats so merged text is not duplicated.
                if cell is previous_cell:
                    continue
                previous_cell = cell
                cell_texts.append(cell.text.strip())
            row_text = "\t".join(cell_texts)
            if row_text.strip():
                text_parts.append(row_text)

    return "\n".join(text_parts)
+
+
def parse_txt(content: bytes) -> str:
    """Decode the bytes of a plain-text file, guessing its encoding.

    Encodings are tried in this order, and the first that decodes without
    error wins:

    1. ``utf-16`` - only when the data starts with a UTF-16 byte-order mark.
    2. ``utf-8-sig`` - UTF-8, with a leading BOM stripped if present.
    3. ``gbk`` - the common legacy Chinese encoding.
    4. ``gb18030`` - superset of GBK, covers four-byte sequences.
    5. ``latin-1`` - last resort; maps every byte, so it cannot fail.

    Args:
        content: Raw bytes of the text file.

    Returns:
        The decoded text, without any leading byte-order mark.
    """
    candidates = ["utf-8-sig", "gbk", "gb18030"]
    # 0xFF/0xFE never occur in UTF-8 and cannot start valid GBK text, so a
    # leading UTF-16 BOM is a reliable signal worth trying first.
    if content.startswith((b"\xff\xfe", b"\xfe\xff")):
        candidates.insert(0, "utf-16")

    for encoding in candidates:
        try:
            return content.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue

    # latin-1 assigns a character to every byte value, so this always succeeds.
    return content.decode("latin-1")
+
+
def parse_to_text(filename: str, content: bytes) -> str:
    """Convert an uploaded file to plain text based on its filename suffix.

    The suffix is everything from the last ``.`` in ``filename``, compared
    case-insensitively. ``.pdf`` goes to ``parse_pdf``, ``.docx``/``.doc`` to
    ``parse_docx`` and ``.txt`` to ``parse_txt``.

    Args:
        filename: Name of the uploaded file; only its suffix is inspected.
        content: Raw bytes of the file.

    Returns:
        The extracted plain text.

    Raises:
        ValueError: If the suffix is not supported, or if the data is a legacy
            binary Word document (OLE2 compound file), which the .docx parser
            cannot read.
    """
    suffix = filename[filename.rfind("."):].lower() if "." in filename else ""
    log.info(f"解析文件: {filename}，类型: {suffix}")

    if suffix == ".pdf":
        return parse_pdf(content)

    if suffix in (".docx", ".doc"):
        # Legacy Word 97-2003 files are OLE2 compound documents, not the
        # zip-based OOXML package that the .docx parser expects. Reject them
        # up front with a clear message instead of an opaque zip error.
        # A ".doc"-named file that is really OOXML still parses normally.
        if content.startswith(b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"):
            raise ValueError(
                "不支持旧版 Word 二进制格式 (.doc)，请另存为 .docx 后重新上传"
            )
        return parse_docx(content)

    if suffix == ".txt":
        return parse_txt(content)

    raise ValueError(f"不支持的文件类型: {suffix}，支持: .pdf, .docx, .doc, .txt")