From b9fa4a257b9b92bc40d86980c226035fe83514bb Mon Sep 17 00:00:00 2001 From: zk Date: Thu, 2 Apr 2026 16:02:35 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=96=87=E4=BB=B6=E7=B1=BB?= =?UTF-8?q?=E5=9E=8B=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/tool/file_parser.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/app/tool/file_parser.py b/app/tool/file_parser.py index 04a2f9b..2920d35 100644 --- a/app/tool/file_parser.py +++ b/app/tool/file_parser.py @@ -24,7 +24,11 @@ def parse_pdf(content: bytes) -> str: def parse_docx(content: bytes) -> str: """解析 Word (.docx) 文件,提取段落和表格文本""" - doc = Document(io.BytesIO(content)) + try: + doc = Document(io.BytesIO(content)) + except Exception: + raise ValueError("无法解析该 Word 文件,如果是旧版 .doc 格式,请另存为 .docx 后重试") + text_parts: list[str] = [] # 段落 @@ -60,9 +64,9 @@ def parse_to_text(filename: str, content: bytes) -> str: if suffix == ".pdf": return parse_pdf(content) - elif suffix == ".docx": + elif suffix in (".docx", ".doc"): return parse_docx(content) elif suffix == ".txt": return parse_txt(content) else: - raise ValueError(f"不支持的文件类型: {suffix},支持: .pdf, .docx, .txt") + raise ValueError(f"不支持的文件类型: {suffix},支持: .pdf, .docx, .doc, .txt")