添加文件类型处理
This commit is contained in:
@@ -24,7 +24,11 @@ def parse_pdf(content: bytes) -> str:
|
|||||||
|
|
||||||
def parse_docx(content: bytes) -> str:
|
def parse_docx(content: bytes) -> str:
|
||||||
"""解析 Word (.docx) 文件,提取段落和表格文本"""
|
"""解析 Word (.docx) 文件,提取段落和表格文本"""
|
||||||
doc = Document(io.BytesIO(content))
|
try:
|
||||||
|
doc = Document(io.BytesIO(content))
|
||||||
|
except Exception:
|
||||||
|
raise ValueError("无法解析该 Word 文件,如果是旧版 .doc 格式,请另存为 .docx 后重试")
|
||||||
|
|
||||||
text_parts: list[str] = []
|
text_parts: list[str] = []
|
||||||
|
|
||||||
# 段落
|
# 段落
|
||||||
@@ -60,9 +64,9 @@ def parse_to_text(filename: str, content: bytes) -> str:
|
|||||||
|
|
||||||
if suffix == ".pdf":
|
if suffix == ".pdf":
|
||||||
return parse_pdf(content)
|
return parse_pdf(content)
|
||||||
elif suffix == ".docx":
|
elif suffix in (".docx", ".doc"):
|
||||||
return parse_docx(content)
|
return parse_docx(content)
|
||||||
elif suffix == ".txt":
|
elif suffix == ".txt":
|
||||||
return parse_txt(content)
|
return parse_txt(content)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"不支持的文件类型: {suffix},支持: .pdf, .docx, .txt")
|
raise ValueError(f"不支持的文件类型: {suffix},支持: .pdf, .docx, .doc, .txt")
|
||||||
|
|||||||
Reference in New Issue
Block a user