优化json提取格式
This commit is contained in:
+20
-34
@@ -1,47 +1,31 @@
|
|||||||
"""AI 调用工具封装"""
|
"""AI 调用工具封装"""
|
||||||
|
|
||||||
import json
|
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from json_repair import repair_json
|
||||||
from langchain_openai import ChatOpenAI
|
from langchain_openai import ChatOpenAI
|
||||||
from langchain_core.messages import SystemMessage, HumanMessage
|
from langchain_core.messages import SystemMessage, HumanMessage
|
||||||
|
|
||||||
from app.core.logger import log
|
from app.core.logger import log
|
||||||
|
|
||||||
# Matches a complete <think>...</think> section (any case, spanning lines) so
# that a reasoning model's thinking text can be stripped before JSON extraction.
_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL | re.IGNORECASE)

# Matches a markdown fenced code block and captures its payload in group 1,
# with surrounding whitespace excluded from the capture.
# The optional first group consumes the fence's language tag, but only when the
# tag is terminated by a newline. That way ```json, ```JSON and ```python are
# all dropped from the payload, while inline content written directly after the
# opening fence (```{"a": 1}```) is still captured in full.
_CODE_BLOCK_RE = re.compile(
    r"```(?:[\w+-]*[ \t]*\r?\n)?\s*(.*?)\s*```",
    re.DOTALL,
)
|
||||||
|
|
||||||
|
|
||||||
def parse_llm_json(text: str) -> Any:
    """Parse JSON out of raw LLM output.

    Processing order:
      1. Remove every ``<think>...</think>`` section.
      2. If a markdown fenced code block is present, keep only the payload of
         the first one.
      3. Hand the remaining text to ``repair_json`` with
         ``return_objects=True`` so that a Python object is returned instead
         of a JSON string. Per the original author's note, this step is meant
         to tolerate malformed JSON (trailing commas, missing quotes, illegal
         escapes).

    Args:
        text: Raw text returned by the model.

    Returns:
        Whatever ``repair_json`` produces for the extracted text.
    """
    without_reasoning = _THINK_RE.sub("", text).strip()

    fenced = _CODE_BLOCK_RE.search(without_reasoning)
    payload = fenced.group(1).strip() if fenced else without_reasoning

    return repair_json(payload, return_objects=True)
|
|
||||||
|
|
||||||
|
|
||||||
async def ai_chat(llm: ChatOpenAI, system_prompt: str, user_message: str) -> str:
|
async def ai_chat(llm: ChatOpenAI, system_prompt: str, user_message: str) -> str:
|
||||||
@@ -57,9 +41,11 @@ async def ai_chat(llm: ChatOpenAI, system_prompt: str, user_message: str) -> str
|
|||||||
async def ai_chat_json(llm: ChatOpenAI, system_prompt: str, user_message: str) -> Any:
    """Call the LLM asynchronously and return its reply parsed as JSON.

    Args:
        llm: Chat model passed through to ``ai_chat``.
        system_prompt: System prompt passed through to ``ai_chat``.
        user_message: User message passed through to ``ai_chat``.

    Returns:
        The parsed JSON value, or ``None`` when the reply is empty/blank or no
        JSON could be extracted from it. Every ``None`` path logs a warning.
    """
    raw = await ai_chat(llm, system_prompt, user_message)
    if not raw or not raw.strip():
        log.warning("AI 返回为空")
        return None

    try:
        parsed = parse_llm_json(raw)
    except Exception as e:
        # Best-effort boundary: any parser failure is logged and reported to
        # the caller as None rather than propagated.
        log.warning("AI JSON 解析失败: {}, raw={}", e, raw[:200])
        return None

    # NOTE(review): json_repair is documented to return an empty string, not
    # raise, when nothing can be salvaged from the input - confirm against the
    # pinned version. Without this check such a failure would be returned as
    # "" with no log line, instead of the None used for every other failure.
    if isinstance(parsed, str) and not parsed:
        log.warning(
            "AI JSON 解析失败: {}, raw={}",
            "no JSON content recovered",
            raw[:200],
        )
        return None

    return parsed
|
||||||
|
|||||||
@@ -128,6 +128,12 @@ async def _do_clean(data: dict) -> None:
|
|||||||
# 写入 bg_job
|
# 写入 bg_job
|
||||||
job_id = next(_id_gen)
|
job_id = next(_id_gen)
|
||||||
now = datetime.now()
|
now = datetime.now()
|
||||||
|
|
||||||
|
# expire_at 校验:超出合理范围的设为 NULL
|
||||||
|
expire_at = data.get("expire_at")
|
||||||
|
if expire_at and isinstance(expire_at, datetime) and expire_at.year > 2035:
|
||||||
|
expire_at = None
|
||||||
|
|
||||||
async with MysqlSession() as mysql:
|
async with MysqlSession() as mysql:
|
||||||
await mysql.execute(
|
await mysql.execute(
|
||||||
insert(Job).values(
|
insert(Job).values(
|
||||||
@@ -146,10 +152,10 @@ async def _do_clean(data: dict) -> None:
|
|||||||
min_experience=result.get("minExperience", 0),
|
min_experience=result.get("minExperience", 0),
|
||||||
required_industry_id=result.get("requiredIndustryId"),
|
required_industry_id=result.get("requiredIndustryId"),
|
||||||
recruit_category=data.get("recruit_category", 3),
|
recruit_category=data.get("recruit_category", 3),
|
||||||
expire_at=data.get("expire_at"),
|
expire_at=expire_at,
|
||||||
content_hash=data.get("content_hash"),
|
content_hash=data.get("content_hash"),
|
||||||
source_url=data.get("detail_url"),
|
source_url=data.get("detail_url"),
|
||||||
source_id=source_id,
|
source_id=str(data_id),
|
||||||
status=0,
|
status=0,
|
||||||
create_time=now,
|
create_time=now,
|
||||||
update_time=now,
|
update_time=now,
|
||||||
|
|||||||
@@ -12,3 +12,4 @@ langchain-core>=0.3
|
|||||||
# 工具
|
# 工具
|
||||||
loguru>=0.7
|
loguru>=0.7
|
||||||
snowflake-id>=1.0
|
snowflake-id>=1.0
|
||||||
|
json-repair>=0.30
|
||||||
Reference in New Issue
Block a user