307 lines
11 KiB
Python
307 lines
11 KiB
Python
"""岗位清洗服务(协程版)"""
|
|
|
|
import asyncio
|
|
import json
|
|
from datetime import datetime
|
|
|
|
from snowflake import SnowflakeGenerator
|
|
from sqlalchemy import text, insert
|
|
|
|
from app.config import settings
|
|
from app.core.database import PgSession, MysqlSession
|
|
from app.core.logger import log
|
|
from app.ai.model_config import JobCleanModel
|
|
from app.ai.prompts import JOB_STRUCTURE_SYSTEM, MAJOR_MATCH_SYSTEM, SKILL_EXTRACT_SYSTEM
|
|
from app.models.mysql.job import Job
|
|
from app.models.mysql.company import Company
|
|
from app.models.mysql.relations import JobRegionRelation, JobSkillTagRelation
|
|
from app.services.ai_tool import ai_chat_json
|
|
from app.services.dict_cache_service import dict_cache
|
|
|
|
# 雪花ID生成器
|
|
_id_gen = SnowflakeGenerator(instance=1)
|
|
|
|
# 公司创建锁(防止并发重复插入同一公司)
|
|
_company_lock = asyncio.Lock()
|
|
|
|
|
|
async def run_job_clean() -> None:
|
|
"""一次批量清洗任务"""
|
|
# 1. 从 PG 锁定一批待清洗数据
|
|
async with PgSession() as pg:
|
|
result = await pg.execute(
|
|
text("""
|
|
SELECT * FROM app_job_data
|
|
WHERE clean_status = 'pending'
|
|
LIMIT :limit
|
|
FOR UPDATE SKIP LOCKED
|
|
"""),
|
|
{"limit": settings.clean_batch_size},
|
|
)
|
|
rows = result.mappings().all()
|
|
if not rows:
|
|
return
|
|
|
|
ids = [r["id"] for r in rows]
|
|
await pg.execute(
|
|
text("""
|
|
UPDATE app_job_data
|
|
SET clean_status = 'cleaning', clean_started_at = NOW()
|
|
WHERE id = ANY(:ids)
|
|
"""),
|
|
{"ids": ids},
|
|
)
|
|
await pg.commit()
|
|
|
|
log.info("岗位清洗:锁定{}条数据", len(rows))
|
|
|
|
# 2. 协程并发清洗,信号量限流
|
|
sem = asyncio.Semaphore(settings.clean_concurrency)
|
|
tasks = [_clean_one(sem, dict(r)) for r in rows]
|
|
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
|
|
# 汇总
|
|
errors = sum(1 for r in results if isinstance(r, Exception))
|
|
log.info("岗位清洗:本批完成,共{}条,异常{}条", len(rows), errors)
|
|
|
|
|
|
async def _clean_one(sem: asyncio.Semaphore, data: dict) -> None:
|
|
"""单条岗位清洗"""
|
|
async with sem:
|
|
try:
|
|
await _do_clean(data)
|
|
except Exception as e:
|
|
log.error("岗位清洗异常, id={}: {}", data["id"], e)
|
|
# 保持 cleaning 状态,由僵尸恢复任务重置
|
|
|
|
|
|
async def _do_clean(data: dict) -> None:
|
|
"""清洗逻辑"""
|
|
data_id = data["id"]
|
|
|
|
# 前置校验
|
|
description = data.get("description") or ""
|
|
if len(description) < 20:
|
|
log.info("[id={}] 丢弃:描述过短({}字符)", data_id, len(description))
|
|
await _update_pg_status(data_id, "discarded")
|
|
return
|
|
|
|
# 第一次AI:结构化提取
|
|
user_message = _build_user_message(data)
|
|
result = await ai_chat_json(JobCleanModel.STRUCTURE, JOB_STRUCTURE_SYSTEM, user_message)
|
|
if result is None or not result.get("valid", False):
|
|
log.info("[id={}] 丢弃:AI判定无效", data_id)
|
|
await _update_pg_status(data_id, "discarded")
|
|
return
|
|
|
|
# 去重检查
|
|
source_id = str(data_id)
|
|
async with MysqlSession() as mysql:
|
|
existing = await mysql.execute(
|
|
text("SELECT COUNT(*) AS cnt FROM bg_job WHERE source_id = :sid"),
|
|
{"sid": source_id},
|
|
)
|
|
if existing.scalar() > 0:
|
|
log.info("[id={}] 跳过:已入库(去重)", data_id)
|
|
await _update_pg_status(data_id, "cleaned")
|
|
return
|
|
|
|
# 公司处理
|
|
company_short_name = result.get("companyShortName") or data.get("company") or ""
|
|
company_id = await _find_or_create_company(company_short_name)
|
|
|
|
# 地区处理
|
|
region_codes = []
|
|
for city in result.get("cities") or []:
|
|
code = dict_cache.match_region_code(city)
|
|
if code:
|
|
region_codes.append(code)
|
|
|
|
# 写入 bg_job
|
|
job_id = next(_id_gen)
|
|
now = datetime.now()
|
|
async with MysqlSession() as mysql:
|
|
await mysql.execute(
|
|
insert(Job).values(
|
|
id=job_id,
|
|
title=result.get("title", ""),
|
|
company_id=company_id,
|
|
category_id=result.get("categoryId", 0),
|
|
employment_type=result.get("employmentType", 0),
|
|
description=result.get("description", ""),
|
|
requirement=result.get("requirement", ""),
|
|
bonus=result.get("bonus"),
|
|
tags=result.get("tags"),
|
|
skill_tags=result.get("skillTags"),
|
|
salary=result.get("salary"),
|
|
education=result.get("education", 0),
|
|
min_experience=result.get("minExperience", 0),
|
|
required_industry_id=result.get("requiredIndustryId"),
|
|
recruit_category=data.get("recruit_category", 3),
|
|
expire_at=data.get("expire_at"),
|
|
source_url=data.get("detail_url"),
|
|
source_id=source_id,
|
|
status=0,
|
|
create_time=now,
|
|
update_time=now,
|
|
)
|
|
)
|
|
|
|
# 写入地区关联
|
|
if region_codes:
|
|
await mysql.execute(
|
|
insert(JobRegionRelation),
|
|
[{"id": next(_id_gen), "job_id": job_id, "region_code": code, "create_time": now} for code in region_codes],
|
|
)
|
|
|
|
await mysql.commit()
|
|
|
|
# 更新 PG 状态
|
|
await _update_pg_status(data_id, "cleaned")
|
|
log.info("[id={}] 入库成功:{} | 公司={} | 地区={}", data_id, result.get("title"), company_short_name, region_codes)
|
|
|
|
# 第二次AI:专业匹配(失败不影响)
|
|
try:
|
|
await _match_major(job_id, result)
|
|
log.debug("[id={}] 专业匹配完成", data_id)
|
|
except Exception as e:
|
|
log.warning("[id={}] 专业匹配失败: {}", data_id, e)
|
|
|
|
# 第三次AI:技能提取(失败不影响)
|
|
try:
|
|
await _extract_skill_tags(job_id, result)
|
|
log.debug("[id={}] 技能提取完成", data_id)
|
|
except Exception as e:
|
|
log.warning("[id={}] 技能提取失败: {}", data_id, e)
|
|
|
|
|
|
async def _match_major(job_id: int, result: dict) -> None:
|
|
"""第二次AI:专业匹配"""
|
|
title = result.get("title", "")
|
|
desc = result.get("description", "")
|
|
req = result.get("requirement", "")
|
|
user_msg = f"【岗位信息】\n标题: {title}\n职责: {desc}\n要求: {req}\n\n【专业分类列表】\n{dict_cache.major_category_text}"
|
|
|
|
data = await ai_chat_json(JobCleanModel.MAJOR_MATCH, MAJOR_MATCH_SYSTEM, user_msg)
|
|
if data is None:
|
|
return
|
|
|
|
major_ids = [mid for mid in (data.get("requiredMajorIds") or []) if mid > 0]
|
|
sensitivity = data.get("majorSensitivity", 0)
|
|
|
|
async with MysqlSession() as mysql:
|
|
await mysql.execute(
|
|
text("""
|
|
UPDATE bg_job SET required_major_ids = :ids, major_sensitivity = :s, update_time = :t
|
|
WHERE id = :jid
|
|
"""),
|
|
{"ids": json.dumps(major_ids) if major_ids else None, "s": sensitivity, "t": datetime.now(), "jid": job_id},
|
|
)
|
|
await mysql.commit()
|
|
|
|
|
|
async def _extract_skill_tags(job_id: int, result: dict) -> None:
|
|
"""第三次AI:技能提取"""
|
|
title = result.get("title", "")
|
|
desc = result.get("description", "")
|
|
req = result.get("requirement", "")
|
|
user_msg = f"【岗位信息】\n标题: {title}\n职责: {desc}\n要求: {req}"
|
|
|
|
skills = await ai_chat_json(JobCleanModel.SKILL_EXTRACT, SKILL_EXTRACT_SYSTEM, user_msg)
|
|
if not skills or not isinstance(skills, list):
|
|
return
|
|
|
|
now = datetime.now()
|
|
tag_ids = []
|
|
async with MysqlSession() as mysql:
|
|
for name in skills:
|
|
name = str(name).strip().lower()
|
|
if not name or len(name) > 50:
|
|
continue
|
|
|
|
tag_id = next(_id_gen)
|
|
# INSERT IGNORE
|
|
await mysql.execute(
|
|
text("INSERT IGNORE INTO bg_skill_tag (id, name) VALUES (:id, :name)"),
|
|
{"id": tag_id, "name": name},
|
|
)
|
|
# 查回真实ID
|
|
row = await mysql.execute(
|
|
text("SELECT id FROM bg_skill_tag WHERE name = :name LIMIT 1"),
|
|
{"name": name},
|
|
)
|
|
real_id = row.scalar()
|
|
if real_id and real_id not in tag_ids:
|
|
tag_ids.append(real_id)
|
|
|
|
if tag_ids:
|
|
await mysql.execute(
|
|
insert(JobSkillTagRelation),
|
|
[{"id": next(_id_gen), "job_id": job_id, "skill_tag_id": tid, "create_time": now} for tid in tag_ids],
|
|
)
|
|
|
|
await mysql.commit()
|
|
|
|
|
|
async def _find_or_create_company(short_name: str) -> int:
|
|
"""查找或创建公司(加锁防并发重复)"""
|
|
async with _company_lock:
|
|
async with MysqlSession() as mysql:
|
|
row = await mysql.execute(
|
|
text("SELECT id FROM bg_company WHERE short_name = :name LIMIT 1"),
|
|
{"name": short_name},
|
|
)
|
|
existing = row.scalar()
|
|
if existing:
|
|
return existing
|
|
|
|
company_id = next(_id_gen)
|
|
now = datetime.now()
|
|
await mysql.execute(
|
|
insert(Company).values(
|
|
id=company_id,
|
|
name=short_name,
|
|
short_name=short_name,
|
|
status=0,
|
|
create_time=now,
|
|
update_time=now,
|
|
)
|
|
)
|
|
await mysql.commit()
|
|
return company_id
|
|
|
|
|
|
async def _update_pg_status(data_id: int, status: str) -> None:
|
|
"""更新 PG 清洗状态"""
|
|
async with PgSession() as pg:
|
|
if status == "cleaned":
|
|
await pg.execute(
|
|
text("UPDATE app_job_data SET clean_status = :s, cleaned_at = NOW() WHERE id = :id"),
|
|
{"s": status, "id": data_id},
|
|
)
|
|
else:
|
|
await pg.execute(
|
|
text("UPDATE app_job_data SET clean_status = :s WHERE id = :id"),
|
|
{"s": status, "id": data_id},
|
|
)
|
|
await pg.commit()
|
|
|
|
|
|
def _build_user_message(data: dict) -> str:
|
|
"""构建第一次AI的用户消息"""
|
|
parts = [
|
|
"【原始数据】",
|
|
f"岗位名称: {data.get('job_title') or ''}",
|
|
f"薪资: {data.get('salary') or ''}",
|
|
f"工作地点: {data.get('location') or ''}",
|
|
f"公司: {data.get('company') or ''}",
|
|
f"经验要求: {data.get('experience') or ''}",
|
|
f"学历要求: {data.get('education') or ''}",
|
|
f"岗位详情: {data.get('description') or ''}",
|
|
"",
|
|
f"【岗位分类列表】\n{dict_cache.job_category_text}",
|
|
"",
|
|
f"【行业列表】\n{dict_cache.industry_text}",
|
|
]
|
|
return "\n".join(parts)
|