"""文件解析工具 将上传的简历文件(PDF / Word / TXT)转换为纯文本字符串。 PDF 使用 PyMuPDF (fitz) 按文本块提取,保持段落边界和阅读顺序。 """ import io import fitz from docx import Document from app.core.logger import log def parse_pdf(content: bytes) -> str: """解析 PDF 文件,按文本块提取,过滤图片块,保持阅读顺序""" text_parts: list[str] = [] with fitz.open(stream=content, filetype="pdf") as doc: for page in doc: for b in page.get_text("blocks", sort=True): if b[6] == 0 and b[4].strip(): # type 0=文本块, 1=图片块 text_parts.append(b[4].strip()) return "\n".join(text_parts) def parse_docx(content: bytes) -> str: """解析 Word (.docx) 文件,提取段落和表格文本""" try: doc = Document(io.BytesIO(content)) except Exception: raise ValueError("无法解析该 Word 文件,如果是旧版 .doc 格式,请另存为 .docx 后重试") text_parts: list[str] = [] for para in doc.paragraphs: text = para.text.strip() if text: text_parts.append(text) for table in doc.tables: for row in table.rows: row_text = "\t".join(cell.text.strip() for cell in row.cells) if row_text.strip(): text_parts.append(row_text) return "\n".join(text_parts) def parse_txt(content: bytes) -> str: """解析 TXT 文件,自动检测编码""" for encoding in ("utf-8", "gbk", "gb2312", "latin-1"): try: return content.decode(encoding) except (UnicodeDecodeError, LookupError): continue return content.decode("utf-8", errors="replace") def parse_to_text(filename: str, content: bytes) -> str: """根据文件名后缀自动选择解析方法,返回纯文本""" suffix = filename[filename.rfind("."):].lower() if "." in filename else "" log.info(f"解析文件: {filename},类型: {suffix}") if suffix == ".pdf": return parse_pdf(content) elif suffix in (".docx", ".doc"): return parse_docx(content) elif suffix == ".txt": return parse_txt(content) else: raise ValueError(f"不支持的文件类型: {suffix},支持: .pdf, .docx, .doc, .txt")