diff --git a/app/services/ai_tool.py b/app/services/ai_tool.py index 6ff9837..3b87c4b 100644 --- a/app/services/ai_tool.py +++ b/app/services/ai_tool.py @@ -1,47 +1,31 @@ """AI 调用工具封装""" -import json import re from typing import Any +from json_repair import repair_json from langchain_openai import ChatOpenAI from langchain_core.messages import SystemMessage, HumanMessage from app.core.logger import log -# markdown 代码块正则 -_CODE_BLOCK_RE = re.compile(r"```\w*\s*\n?(.*?)\n?\s*```", re.DOTALL) -# 控制字符正则(保留 \t \n \r) -_CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]") +# 匹配 <think>任意内容</think>,用于剥离推理模型的思考过程 +_THINK_RE = re.compile(r"<think>.*?</think>", re.DOTALL | re.IGNORECASE) + +# 匹配 ```json ... ``` 代码块,提取中间的 JSON 内容 +_CODE_BLOCK_RE = re.compile(r"```(?:json\w*)?\s*\n?(.*?)\n?\s*```", re.DOTALL | re.IGNORECASE) -def clean_ai_response(response: str) -> str: - """从 AI 返回的文本中提取干净的 JSON 字符串""" - if not response or not response.strip(): - return "" - - result = response.strip() - - # 尝试从 markdown 代码块提取 - match = _CODE_BLOCK_RE.search(result) +def parse_llm_json(text: str) -> Any: + """解析 AI 输出的 JSON,自动去除思考标签、markdown 代码块,容错处理""" + # 1. 去掉 <think>...</think> 思考内容 + cleaned = _THINK_RE.sub("", text).strip() + # 2. 如果有 ```json ... ``` 代码块,只取代码块里的内容 + match = _CODE_BLOCK_RE.search(cleaned) if match: - result = match.group(1).strip() - else: - # 定位首个 JSON 起始符 - obj_start = result.find("{") - arr_start = result.find("[") - if obj_start < 0: - start = arr_start - elif arr_start < 0: - start = obj_start - else: - start = min(obj_start, arr_start) - if start > 0: - result = result[start:] - - # 清除控制字符 - result = _CONTROL_CHAR_RE.sub("", result) - return result + cleaned = match.group(1).strip() + # 3. 
repair_json 容错解析:修复不规范的 JSON(多余逗号、缺引号、非法转义等) + return repair_json(cleaned, return_objects=True) async def ai_chat(llm: ChatOpenAI, system_prompt: str, user_message: str) -> str: @@ -57,9 +41,11 @@ async def ai_chat(llm: ChatOpenAI, system_prompt: str, user_message: str) -> str async def ai_chat_json(llm: ChatOpenAI, system_prompt: str, user_message: str) -> Any: """异步调用 LLM,返回解析后的 JSON 对象""" raw = await ai_chat(llm, system_prompt, user_message) - cleaned = clean_ai_response(raw) + if not raw or not raw.strip(): + log.warning("AI 返回为空") + return None try: - return json.loads(cleaned) - except json.JSONDecodeError as e: + return parse_llm_json(raw) + except Exception as e: log.warning("AI JSON 解析失败: {}, raw={}", e, raw[:200]) return None diff --git a/app/services/job_clean_service.py b/app/services/job_clean_service.py index 993ace1..4c872e2 100644 --- a/app/services/job_clean_service.py +++ b/app/services/job_clean_service.py @@ -128,6 +128,12 @@ async def _do_clean(data: dict) -> None: # 写入 bg_job job_id = next(_id_gen) now = datetime.now() + + # expire_at 校验:超出合理范围的设为 NULL + expire_at = data.get("expire_at") + if expire_at and isinstance(expire_at, datetime) and expire_at.year > 2035: + expire_at = None + async with MysqlSession() as mysql: await mysql.execute( insert(Job).values( @@ -146,10 +152,10 @@ async def _do_clean(data: dict) -> None: min_experience=result.get("minExperience", 0), required_industry_id=result.get("requiredIndustryId"), recruit_category=data.get("recruit_category", 3), - expire_at=data.get("expire_at"), + expire_at=expire_at, content_hash=data.get("content_hash"), source_url=data.get("detail_url"), - source_id=source_id, + source_id=str(data_id), status=0, create_time=now, update_time=now, diff --git a/requirements.txt b/requirements.txt index c7ddeb9..f5a3904 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ langchain-core>=0.3 # 工具 loguru>=0.7 -snowflake-id>=1.0 \ No newline at end of file +snowflake-id>=1.0 
+json-repair>=0.30 \ No newline at end of file