抽象json提取风格
This commit is contained in:
@@ -140,6 +140,12 @@ inclusion: manual
|
|||||||
- AI 调用应做好异常捕获和容错,单次失败不应影响整体流程
|
- AI 调用应做好异常捕获和容错,单次失败不应影响整体流程
|
||||||
- 长耗时 AI 调用考虑异步执行
|
- 长耗时 AI 调用考虑异步执行
|
||||||
|
|
||||||
|
### AI 输出 JSON 解析
|
||||||
|
- LLM 返回的 JSON 经常被 markdown 代码块(` ```json ... ``` `)包裹,**禁止**直接使用 LangChain 的 `JsonOutputParser`
|
||||||
|
- 统一使用 `app.tool.json_helper.parse_llm_json` 解析 AI 输出的 JSON 文本
|
||||||
|
- `parse_llm_json` 会自动剥离 markdown 代码块标记,并通过 `json_repair` 做容错修复
|
||||||
|
- **不要**在各模块中自行编写 JSON 清洗/解析逻辑,统一复用 `parse_llm_json`
|
||||||
|
|
||||||
## 代码格式规范
|
## 代码格式规范
|
||||||
|
|
||||||
### 紧凑风格
|
### 紧凑风格
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ offerpie_python_ai/
|
|||||||
│
|
│
|
||||||
├─ tool/ # **工具层**(无状态、无业务依赖的通用工具)
|
├─ tool/ # **工具层**(无状态、无业务依赖的通用工具)
|
||||||
│ ├─ file_parser.py # 文件解析工具(PDF/Word/TXT → 纯文本,parse_to_text 入口方法)
|
│ ├─ file_parser.py # 文件解析工具(PDF/Word/TXT → 纯文本,parse_to_text 入口方法)
|
||||||
|
│ ├─ json_helper.py # AI 输出 JSON 解析工具(自动去除 markdown 代码块包裹 + json_repair 容错,parse_llm_json 入口方法)
|
||||||
│ └─ snowflake.py # 雪花 ID 生成工具(next_id)
|
│ └─ snowflake.py # 雪花 ID 生成工具(next_id)
|
||||||
│
|
│
|
||||||
├─ schemas/ # **Schema 层**(Pydantic 请求/响应/缓存模型)
|
├─ schemas/ # **Schema 层**(Pydantic 请求/响应/缓存模型)
|
||||||
@@ -84,7 +85,7 @@ offerpie_python_ai/
|
|||||||
| **ai** | AI 模型管理 + 业务 AI 能力 | `LLM` 枚举、`resume_extractor/`(简历并行提取)、`resume_diagnoser/`(简历诊断)、`skill_gap_analyzer/`(技能差距分析 + 定制简历优化 + Agent 原子化规划 + 单条记录修改/新增) |
|
| **ai** | AI 模型管理 + 业务 AI 能力 | `LLM` 枚举、`resume_extractor/`(简历并行提取)、`resume_diagnoser/`(简历诊断)、`skill_gap_analyzer/`(技能差距分析 + 定制简历优化 + Agent 原子化规划 + 单条记录修改/新增) |
|
||||||
| **api** | REST API 路由定义 | `health.py`(健康检查)、`resume.py`(简历上传解析)、`resume_diagnose.py`(简历诊断)、`skill_gap.py`(技能差距分析 + 定制简历) |
|
| **api** | REST API 路由定义 | `health.py`(健康检查)、`resume.py`(简历上传解析)、`resume_diagnose.py`(简历诊断)、`skill_gap.py`(技能差距分析 + 定制简历) |
|
||||||
| **models** | SQLAlchemy ORM 模型,与 Java 端共享同一数据库 | `FuncPermission`、`UserFuncPermissionStock`、`UserFuncUsageLog`、`UserResume`、`UserResumeEducation`/`Work`/`Internship`/`Project`/`Competition`、`ResumeDiagnosisReport`、`ResumeDiagnosisIssue`、`Job`(只读) |
|
| **models** | SQLAlchemy ORM 模型,与 Java 端共享同一数据库 | `FuncPermission`、`UserFuncPermissionStock`、`UserFuncUsageLog`、`UserResume`、`UserResumeEducation`/`Work`/`Internship`/`Project`/`Competition`、`ResumeDiagnosisReport`、`ResumeDiagnosisIssue`、`Job`(只读) |
|
||||||
| **tool** | 无状态通用工具,不依赖数据库/Redis/用户上下文 | `file_parser.py`(PDF/Word/TXT 文件解析为纯文本)、`snowflake.py`(雪花ID生成) |
|
| **tool** | 无状态通用工具,不依赖数据库/Redis/用户上下文 | `file_parser.py`(PDF/Word/TXT 文件解析为纯文本)、`json_helper.py`(AI 输出 JSON 解析,去 markdown 代码块 + json_repair 容错)、`snowflake.py`(雪花ID生成) |
|
||||||
| **services** | 业务逻辑实现 | `FuncPermissionService`(功能权限校验、扣减、回退)、`ResumeParseService`(简历文件解析→AI结构化→入库)、`ResumeDiagnoseService`(简历诊断→AI并行分析→评级→入库)、`SkillGapService`(技能差距分析→定制简历生成/查询/编辑/回滚→AI对话编辑(原子化操作:delete直接删/update按记录并发/add并发生成)) |
|
| **services** | 业务逻辑实现 | `FuncPermissionService`(功能权限校验、扣减、回退)、`ResumeParseService`(简历文件解析→AI结构化→入库)、`ResumeDiagnoseService`(简历诊断→AI并行分析→评级→入库)、`SkillGapService`(技能差距分析→定制简历生成/查询/编辑/回滚→AI对话编辑(原子化操作:delete直接删/update按记录并发/add并发生成)) |
|
||||||
|
|
||||||
## 3️⃣ 技术栈
|
## 3️⃣ 技术栈
|
||||||
|
|||||||
@@ -1,9 +1,6 @@
|
|||||||
"""简历诊断 AI 引擎:并行诊断 + 汇总评价"""
|
"""简历诊断 AI 引擎:并行诊断 + 汇总评价"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import re
|
|
||||||
|
|
||||||
from json_repair import repair_json
|
|
||||||
|
|
||||||
from langchain_core.output_parsers import StrOutputParser
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
from langchain_core.prompts import ChatPromptTemplate
|
from langchain_core.prompts import ChatPromptTemplate
|
||||||
@@ -11,13 +8,7 @@ from langchain_core.prompts import ChatPromptTemplate
|
|||||||
from app.ai.models import LLM
|
from app.ai.models import LLM
|
||||||
from app.ai.resume_diagnoser.prompts import DIAGNOSE_MODULE_PROMPT, SUMMARY_PROMPT, POLISH_PROMPT
|
from app.ai.resume_diagnoser.prompts import DIAGNOSE_MODULE_PROMPT, SUMMARY_PROMPT, POLISH_PROMPT
|
||||||
from app.core.logger import log
|
from app.core.logger import log
|
||||||
|
from app.tool.json_helper import parse_llm_json
|
||||||
|
|
||||||
def _parse_json(text: str) -> dict:
|
|
||||||
"""解析 AI 输出的 JSON,自动去除 markdown 代码块包裹,容错处理"""
|
|
||||||
cleaned = re.sub(r"^```(?:json)?\s*\n?", "", text.strip())
|
|
||||||
cleaned = re.sub(r"\n?```\s*$", "", cleaned)
|
|
||||||
return repair_json(cleaned, return_objects=True)
|
|
||||||
|
|
||||||
|
|
||||||
# 诊断链(StrOutputParser 拿原始文本,再手动解析 JSON,避免 markdown 代码块导致解析失败)
|
# 诊断链(StrOutputParser 拿原始文本,再手动解析 JSON,避免 markdown 代码块导致解析失败)
|
||||||
@@ -92,7 +83,7 @@ async def polish_content(module_type: str, reference_content: list[dict] | str |
|
|||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
raw = await _polish_chain.ainvoke(inp)
|
raw = await _polish_chain.ainvoke(inp)
|
||||||
result = _parse_json(raw)
|
result = parse_llm_json(raw)
|
||||||
if isinstance(result, list):
|
if isinstance(result, list):
|
||||||
return [str(item) for item in result]
|
return [str(item) for item in result]
|
||||||
return [str(result)]
|
return [str(result)]
|
||||||
@@ -106,7 +97,7 @@ async def _safe_invoke(task: dict) -> dict:
|
|||||||
raw = ""
|
raw = ""
|
||||||
try:
|
try:
|
||||||
raw = await _diagnose_chain.ainvoke(task)
|
raw = await _diagnose_chain.ainvoke(task)
|
||||||
return _parse_json(raw)
|
return parse_llm_json(raw)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"AI诊断[{task.get('module_type', '')}]失败: {e}\n原始输出: {raw[:500]}")
|
log.warning(f"AI诊断[{task.get('module_type', '')}]失败: {e}\n原始输出: {raw[:500]}")
|
||||||
return _empty_result()
|
return _empty_result()
|
||||||
|
|||||||
@@ -1,9 +1,7 @@
|
|||||||
"""简历并行提取:将完整简历文本拆分为5个AI任务并行提取"""
|
"""简历并行提取:将完整简历文本拆分为5个AI任务并行提取"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import re
|
|
||||||
|
|
||||||
from json_repair import repair_json
|
|
||||||
from langchain_core.output_parsers import StrOutputParser
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
from langchain_core.prompts import ChatPromptTemplate
|
from langchain_core.prompts import ChatPromptTemplate
|
||||||
|
|
||||||
@@ -13,13 +11,7 @@ from app.ai.resume_extractor.prompts import (
|
|||||||
PROJECT_PROMPT, COMPETITION_PROMPT,
|
PROJECT_PROMPT, COMPETITION_PROMPT,
|
||||||
)
|
)
|
||||||
from app.core.logger import log
|
from app.core.logger import log
|
||||||
|
from app.tool.json_helper import parse_llm_json
|
||||||
|
|
||||||
def _parse_json(text: str) -> dict:
|
|
||||||
"""解析 AI 输出的 JSON,自动去除 markdown 代码块包裹,容错处理"""
|
|
||||||
cleaned = re.sub(r"^```(?:json)?\s*\n?", "", text.strip())
|
|
||||||
cleaned = re.sub(r"\n?```\s*$", "", cleaned)
|
|
||||||
return repair_json(cleaned, return_objects=True)
|
|
||||||
|
|
||||||
|
|
||||||
def _build_chain(prompt: str):
|
def _build_chain(prompt: str):
|
||||||
@@ -65,7 +57,7 @@ async def _safe_invoke(chain, inp: dict, label: str):
|
|||||||
"""单个链调用,失败返回空"""
|
"""单个链调用,失败返回空"""
|
||||||
try:
|
try:
|
||||||
raw = await chain.ainvoke(inp)
|
raw = await chain.ainvoke(inp)
|
||||||
return _parse_json(raw)
|
return parse_llm_json(raw)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"AI提取[{label}]失败: {e}")
|
log.warning(f"AI提取[{label}]失败: {e}")
|
||||||
return {} if "个人信息" in label else []
|
return {} if "个人信息" in label else []
|
||||||
|
|||||||
@@ -5,9 +5,7 @@
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import re
|
|
||||||
|
|
||||||
from json_repair import repair_json
|
|
||||||
from langchain_core.output_parsers import StrOutputParser
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
from langchain_core.prompts import ChatPromptTemplate
|
from langchain_core.prompts import ChatPromptTemplate
|
||||||
|
|
||||||
@@ -17,13 +15,7 @@ from app.ai.skill_gap_analyzer.prompts import (
|
|||||||
AGENT_PLAN_PROMPT, AGENT_MODULE_EDIT_PROMPT, AGENT_MODULE_ADD_PROMPT, MODULE_SCHEMAS,
|
AGENT_PLAN_PROMPT, AGENT_MODULE_EDIT_PROMPT, AGENT_MODULE_ADD_PROMPT, MODULE_SCHEMAS,
|
||||||
)
|
)
|
||||||
from app.core.logger import log
|
from app.core.logger import log
|
||||||
|
from app.tool.json_helper import parse_llm_json
|
||||||
|
|
||||||
def _parse_json(text: str):
|
|
||||||
"""解析 AI 输出的 JSON,自动去除 markdown 代码块包裹,容错处理"""
|
|
||||||
cleaned = re.sub(r"^```(?:json)?\s*\n?", "", text.strip())
|
|
||||||
cleaned = re.sub(r"\n?```\s*$", "", cleaned)
|
|
||||||
return repair_json(cleaned, return_objects=True)
|
|
||||||
|
|
||||||
|
|
||||||
# ===== 差距分析 =====
|
# ===== 差距分析 =====
|
||||||
@@ -39,7 +31,7 @@ async def analyze_skill_gap(skill_tags: list[str], resume_json: str) -> list[str
|
|||||||
"""分析技能差距,返回缺失技能列表"""
|
"""分析技能差距,返回缺失技能列表"""
|
||||||
try:
|
try:
|
||||||
raw = await _skill_gap_chain.ainvoke({"skill_tags": str(skill_tags), "resume_json": resume_json})
|
raw = await _skill_gap_chain.ainvoke({"skill_tags": str(skill_tags), "resume_json": resume_json})
|
||||||
result = _parse_json(raw)
|
result = parse_llm_json(raw)
|
||||||
if isinstance(result, list):
|
if isinstance(result, list):
|
||||||
return [s for s in result if isinstance(s, str) and s in skill_tags]
|
return [s for s in result if isinstance(s, str) and s in skill_tags]
|
||||||
return skill_tags # 解析异常降级:全部标记缺失
|
return skill_tags # 解析异常降级:全部标记缺失
|
||||||
@@ -85,7 +77,7 @@ async def optimize_module(job_title: str, job_description: str, module_data: str
|
|||||||
"job_title": job_title, "job_description": job_description or "",
|
"job_title": job_title, "job_description": job_description or "",
|
||||||
"original_module_data": module_data,
|
"original_module_data": module_data,
|
||||||
})
|
})
|
||||||
return _parse_json(raw)
|
return parse_llm_json(raw)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"AI优化经历模块失败: {e}")
|
log.warning(f"AI优化经历模块失败: {e}")
|
||||||
return None
|
return None
|
||||||
@@ -109,7 +101,7 @@ async def plan_edit(job_title: str, job_description: str, resume_json: str,
|
|||||||
"resume_json": resume_json,
|
"resume_json": resume_json,
|
||||||
"chat_history": chat_history, "instruction": instruction,
|
"chat_history": chat_history, "instruction": instruction,
|
||||||
})
|
})
|
||||||
result = _parse_json(raw)
|
result = parse_llm_json(raw)
|
||||||
return result if isinstance(result, dict) else None
|
return result if isinstance(result, dict) else None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"AI规划失败: {e}")
|
log.warning(f"AI规划失败: {e}")
|
||||||
@@ -135,7 +127,7 @@ async def execute_record_edit(job_title: str, job_description: str, instruction:
|
|||||||
"instruction": instruction, "chat_history": chat_history,
|
"instruction": instruction, "chat_history": chat_history,
|
||||||
"module_schema": module_schema, "record_data": record_data,
|
"module_schema": module_schema, "record_data": record_data,
|
||||||
})
|
})
|
||||||
return _parse_json(raw)
|
return parse_llm_json(raw)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"AI单条记录修改失败: {e}")
|
log.warning(f"AI单条记录修改失败: {e}")
|
||||||
return None
|
return None
|
||||||
@@ -159,7 +151,7 @@ async def execute_record_add(job_title: str, job_description: str, instruction:
|
|||||||
"instruction": instruction, "chat_history": chat_history,
|
"instruction": instruction, "chat_history": chat_history,
|
||||||
"module_schema": module_schema,
|
"module_schema": module_schema,
|
||||||
})
|
})
|
||||||
result = _parse_json(raw)
|
result = parse_llm_json(raw)
|
||||||
return result if isinstance(result, dict) else None
|
return result if isinstance(result, dict) else None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.warning(f"AI新增记录失败: {e}")
|
log.warning(f"AI新增记录失败: {e}")
|
||||||
|
|||||||
@@ -0,0 +1,15 @@
|
|||||||
|
"""AI 输出 JSON 解析工具
|
||||||
|
|
||||||
|
将 LLM 返回的可能带 markdown 代码块包裹的文本解析为 Python 对象。
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from json_repair import repair_json
|
||||||
|
|
||||||
|
|
||||||
|
# Opening fence anchored at the start of the text; the language tag is optional
# and matched case-insensitively so "```JSON" is handled like "```json".
_FENCE_OPEN = re.compile(r"^```(?:json)?\s*", re.IGNORECASE)

# A complete fenced block located anywhere in the text whose payload starts like
# a JSON container. Requiring "{" or "[" keeps unrelated code blocks from matching.
_FENCED_JSON = re.compile(
    r"```(?:json)?[ \t]*\r?\n\s*([\[{].*?)\s*\n[ \t]*```",
    re.IGNORECASE | re.DOTALL,
)


def strip_code_fence(text: str) -> str:
    """Return *text* without the markdown code fence that wraps its JSON payload.

    Handled shapes:

    * the whole text is one fenced block (with or without a ``json`` tag, any
      letter case, with or without line breaks around the payload);
    * an opening fence with no closing fence, or a closing fence only;
    * a fenced JSON object/array surrounded by extra prose before or after it.

    Text that already starts with ``{`` or ``[`` is never searched for embedded
    fences, so backticks inside JSON string values are left untouched.

    Args:
        text: Raw model output.

    Returns:
        The payload with fence markers and surrounding whitespace removed.

    Raises:
        AttributeError: If *text* is not a string (e.g. ``None``).
    """
    body = text.strip()
    wrapped = body.startswith("```") and body.endswith("```")
    if not wrapped and not body.startswith(("{", "[")):
        # The fence is not the outermost wrapper: the model added a preamble
        # and/or trailing commentary. Pull out the fenced JSON block instead.
        embedded = _FENCED_JSON.search(body)
        if embedded:
            return embedded.group(1)
    # Outermost-fence handling: drop a leading fence line and a trailing fence.
    body = _FENCE_OPEN.sub("", body).removesuffix("```")
    return body.strip()


def parse_llm_json(text: str):
    """Parse JSON produced by an LLM into a Python object.

    The markdown code fence (if any) is removed with ``strip_code_fence`` and
    the remaining text is handed to ``json_repair.repair_json`` with
    ``return_objects=True``, which tolerates malformed JSON and returns the
    decoded object rather than a string.

    Args:
        text: Raw model output, possibly wrapped in a fenced code block.

    Returns:
        Whatever ``repair_json`` decodes. Callers should check the type
        (``dict`` / ``list``) before use, since the model may not return the
        shape that was asked for.

    Raises:
        AttributeError: If *text* is not a string (e.g. ``None``).
    """
    return repair_json(strip_code_fence(text), return_objects=True)
|
||||||
Reference in New Issue
Block a user