优化简历提取速度
This commit is contained in:
+8
-14
@@ -1,24 +1,25 @@
|
||||
"""文件解析工具
|
||||
|
||||
将上传的简历文件(PDF / Word / TXT)转换为纯文本字符串。
|
||||
PDF 使用 PyMuPDF (fitz) 按文本块提取,保持段落边界和阅读顺序。
|
||||
"""
|
||||
|
||||
import io
|
||||
|
||||
import pdfplumber
|
||||
import fitz
|
||||
from docx import Document
|
||||
|
||||
from app.core.logger import log
|
||||
|
||||
|
||||
def parse_pdf(content: bytes) -> str:
    """Extract the text of a PDF, block by block, as one string.

    Every page is read with PyMuPDF's block extraction, requested with
    ``sort=True`` so blocks come back in the library's sorted order.
    Image blocks are skipped and blank text blocks are dropped; the text
    of each remaining block is stripped of surrounding whitespace.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The kept block texts joined with newline characters. An empty
        string when no text block is found.
    """
    text_parts: list[str] = []
    with fitz.open(stream=content, filetype="pdf") as doc:
        for page in doc:
            for block in page.get_text("blocks", sort=True):
                # Index 6 is the block type: 0 = text block, 1 = image block.
                if block[6] != 0:
                    continue
                # Index 4 carries the block's text content.
                block_text = block[4].strip()
                if block_text:
                    text_parts.append(block_text)
    return "\n".join(text_parts)
|
||||
|
||||
|
||||
@@ -28,22 +29,16 @@ def parse_docx(content: bytes) -> str:
|
||||
doc = Document(io.BytesIO(content))
|
||||
except Exception:
|
||||
raise ValueError("无法解析该 Word 文件,如果是旧版 .doc 格式,请另存为 .docx 后重试")
|
||||
|
||||
text_parts: list[str] = []
|
||||
|
||||
# 段落
|
||||
for para in doc.paragraphs:
|
||||
text = para.text.strip()
|
||||
if text:
|
||||
text_parts.append(text)
|
||||
|
||||
# 表格
|
||||
for table in doc.tables:
|
||||
for row in table.rows:
|
||||
row_text = "\t".join(cell.text.strip() for cell in row.cells)
|
||||
if row_text.strip():
|
||||
text_parts.append(row_text)
|
||||
|
||||
return "\n".join(text_parts)
|
||||
|
||||
|
||||
@@ -61,7 +56,6 @@ def parse_to_text(filename: str, content: bytes) -> str:
|
||||
"""根据文件名后缀自动选择解析方法,返回纯文本"""
|
||||
suffix = filename[filename.rfind("."):].lower() if "." in filename else ""
|
||||
log.info(f"解析文件: {filename},类型: {suffix}")
|
||||
|
||||
if suffix == ".pdf":
|
||||
return parse_pdf(content)
|
||||
elif suffix in (".docx", ".doc"):
|
||||
|
||||
Reference in New Issue
Block a user