69 lines
2.0 KiB
Python
69 lines
2.0 KiB
Python
"""文件解析工具
|
|
|
|
将上传的简历文件(PDF / Word / TXT)转换为纯文本字符串。
|
|
"""
|
|
|
|
import io
|
|
|
|
import pdfplumber
|
|
from docx import Document
|
|
|
|
from app.core.logger import log
|
|
|
|
|
|
def parse_pdf(content: bytes) -> str:
    """Extract the text of every page of a PDF document.

    Args:
        content: Raw bytes of the PDF file.

    Returns:
        The text of all pages, in page order, joined with newlines. Pages for
        which extraction yields nothing (``None`` or an empty string) are
        left out.
    """
    buffer = io.BytesIO(content)
    with pdfplumber.open(buffer) as document:
        extracted = (page.extract_text() for page in document.pages)
        # Keep only pages that actually produced text.
        non_empty_pages = [chunk for chunk in extracted if chunk]
    return "\n".join(non_empty_pages)
|
|
|
|
|
|
def parse_docx(content: bytes) -> str:
    """Extract paragraph and table text from a Word (.docx) document.

    Args:
        content: Raw bytes of the .docx file.

    Returns:
        One line per non-blank paragraph, followed by one line per non-blank
        table row (cell texts stripped and joined with tabs), all joined with
        newlines.
    """
    document = Document(io.BytesIO(content))

    # Body paragraphs come first, blank ones dropped.
    stripped_paragraphs = (paragraph.text.strip() for paragraph in document.paragraphs)
    lines: list[str] = [line for line in stripped_paragraphs if line]

    # Table rows follow, each flattened to a single tab-separated line.
    for table in document.tables:
        flattened_rows = (
            "\t".join(cell.text.strip() for cell in row.cells)
            for row in table.rows
        )
        # A row made only of empty cells collapses to tabs; skip it.
        lines.extend(flat for flat in flattened_rows if flat.strip())

    return "\n".join(lines)
|
|
|
|
|
|
def parse_txt(content: bytes) -> str:
    """Decode the bytes of a TXT file to text, guessing the encoding.

    Detection order:

    1. A UTF-16 byte-order mark selects UTF-16. Such input is valid in
       neither UTF-8 nor GBK, so without this step it would be returned as
       Latin-1 mojibake.
    2. UTF-8, with a leading BOM stripped if present.
    3. GBK.
    4. Latin-1, which maps every byte and therefore never fails.

    Args:
        content: Raw bytes of the text file.

    Returns:
        The decoded text (an empty string for empty input).
    """
    if content.startswith((b"\xff\xfe", b"\xfe\xff")):
        try:
            return content.decode("utf-16")
        except UnicodeDecodeError:
            # BOM-like prefix but not valid UTF-16 (e.g. odd byte count);
            # fall through to the generic candidates below.
            pass

    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM, so the
    # result never starts with a stray U+FEFF. "gb2312" is not tried because
    # GBK is a superset of it: anything GBK rejects, GB2312 rejects too.
    for encoding in ("utf-8-sig", "gbk"):
        try:
            return content.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue

    # Last resort: every byte sequence is valid Latin-1.
    return content.decode("latin-1")
|
|
|
|
|
|
# Signature of an OLE2 compound file, the container used by legacy binary
# .doc files and by password-protected Office documents. python-docx can only
# open OOXML (zip) packages, so such content can never be parsed as .docx.
_OLE2_MAGIC = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"


def parse_to_text(filename: str, content: bytes) -> str:
    """Pick a parser from the file name's extension and return plain text.

    The extension is everything from the last ``.`` in ``filename``,
    lower-cased; a name without a dot has no extension.

    Args:
        filename: Name of the uploaded file; only its extension is used.
        content: Raw bytes of the file.

    Returns:
        The text produced by ``parse_pdf``, ``parse_docx`` or ``parse_txt``.

    Raises:
        ValueError: If the extension is not one of .pdf, .docx, .doc, .txt,
            or if a .doc/.docx upload is an OLE2 container (legacy binary
            .doc or an encrypted document) that cannot be read as .docx.
    """
    _, dot, extension = filename.rpartition(".")
    suffix = f".{extension}".lower() if dot else ""
    log.info(f"解析文件: {filename},类型: {suffix}")

    if suffix == ".pdf":
        return parse_pdf(content)

    if suffix in (".docx", ".doc"):
        # A ".doc" file is only parseable when it is really an OOXML package.
        # Reject OLE2 content up front with a clear message instead of letting
        # the docx parser fail with a low-level zip error.
        if content.startswith(_OLE2_MAGIC):
            raise ValueError(
                "该 Word 文件是旧版二进制 .doc 格式或已加密,无法解析,"
                "请另存为未加密的 .docx 后重试"
            )
        return parse_docx(content)

    if suffix == ".txt":
        return parse_txt(content)

    raise ValueError(f"不支持的文件类型: {suffix},支持: .pdf, .docx, .doc, .txt")
|