初始化
This commit is contained in:
@@ -0,0 +1,22 @@
|
||||
"""AI 模型场景配置
|
||||
|
||||
集中管理清洗服务的模型选择与参数,修改模型只需改此文件。
|
||||
"""
|
||||
|
||||
from app.ai.models import LLM
|
||||
|
||||
|
||||
class JobCleanModel:
    """Chat-model instances for the job-cleaning steps.

    Each attribute is built once, when this class body executes, by calling
    ``LLM.DOUBAO_SEED_LITE.create(temperature=0)``.
    """

    # First AI pass: structured extraction of the job information.
    STRUCTURE = LLM.DOUBAO_SEED_LITE.create(temperature=0)
    # Second AI pass: major matching.
    MAJOR_MATCH = LLM.DOUBAO_SEED_LITE.create(temperature=0)
    # Third AI pass: skill extraction.
    SKILL_EXTRACT = LLM.DOUBAO_SEED_LITE.create(temperature=0)
|
||||
|
||||
|
||||
class CompanyCleanModel:
    """Chat-model instance for the company-enrichment step.

    The attribute is built once, when this class body executes.
    """

    # Company information enrichment.
    ENRICH = LLM.DOUBAO_SEED_LITE.create(temperature=0)
|
||||
@@ -0,0 +1,41 @@
|
||||
"""LLM 模型枚举与实例获取
|
||||
|
||||
Usage:
|
||||
from app.ai.models import LLM
|
||||
|
||||
llm = LLM.DOUBAO_SEED_LITE.create(temperature=0)
|
||||
"""
|
||||
|
||||
from enum import Enum
|
||||
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
from app.config import settings
|
||||
|
||||
# Provider connection settings, expressed as zero-argument getters so the
# values are read from `settings` when called, not when this module is imported.
_VOLCENGINE = (
    lambda: settings.volcengine_api_key,
    lambda: settings.volcengine_base_url,
)


class LLM(Enum):
    """Catalogue of the available models.

    Every member's value is ``(model name, api-key getter, base-url getter)``.
    """

    # Volcengine
    DOUBAO_PRO_32K = ("doubao-1-5-pro-32k-250115", *_VOLCENGINE)
    DOUBAO_LITE_32K = ("doubao-1-5-lite-32k-250115", *_VOLCENGINE)
    DOUBAO_SEED_LITE = ("doubao-seed-2-0-lite-260215", *_VOLCENGINE)
    DOUBAO_SEED_PRO = ("doubao-seed-2-0-pro-260215", *_VOLCENGINE)
    DEEPSEEK_V4_FLASH = ("deepseek-v4-flash-260425", *_VOLCENGINE)

    def __init__(self, model_name: str, api_key_fn, base_url_fn):
        # Enum unpacks the member's tuple value into these positional arguments.
        self.model_name, self._api_key_fn, self._base_url_fn = (
            model_name,
            api_key_fn,
            base_url_fn,
        )

    def create(self, **kwargs) -> ChatOpenAI:
        """Build a new ChatOpenAI client for this model.

        Extra keyword arguments (temperature, max_tokens, ...) are forwarded
        to ChatOpenAI unchanged.
        """
        api_key = self._api_key_fn()
        base_url = self._base_url_fn()
        return ChatOpenAI(model=self.model_name, api_key=api_key, base_url=base_url, **kwargs)
|
||||
@@ -0,0 +1,100 @@
|
||||
"""各步骤 Prompt 模板"""
|
||||
|
||||
# ──────────── 第一次AI:岗位结构化提取 ────────────
|
||||
JOB_STRUCTURE_SYSTEM = """你是一个岗位数据清洗助手。请根据提供的原始岗位数据,提取并结构化为JSON格式。
|
||||
|
||||
返回JSON格式要求:
|
||||
{
|
||||
"valid": true/false,
|
||||
"title": "岗位名称",
|
||||
"salary": "标准化薪资,如10-20K、面议,无效则null",
|
||||
"education": 0-4的数字(0=不限 1=大专 2=本科 3=硕士 4=博士),
|
||||
"minExperience": 最低工作年限数字(不要求则0),
|
||||
"employmentType": 0或1(0=全职 1=兼职,默认0),
|
||||
"categoryId": 岗位分类ID(必选,从分类列表中选最接近的),
|
||||
"requiredIndustryId": 行业ID(仅当明确提到行业经验要求时设置,列表中无完全匹配则选最相似的,未提到则null),
|
||||
"description": "岗位职责,保持原文风格,格式化展示",
|
||||
"requirement": "任职要求,保持原文风格,格式化展示",
|
||||
"bonus": "加分项,无则null",
|
||||
"tags": ["核心职能标签,最多5个,如数据分析、产品策略"],
|
||||
"skillTags": ["技能关键词,最多8个,如Java、Spring Boot"],
|
||||
"companyShortName": "简洁的公司简称,如字节跳动、中国平安",
|
||||
"cities": ["工作城市列表,精确到市"]
|
||||
}
|
||||
|
||||
规则:
|
||||
1. description/requirement/bonus 均从原始的 description+experience+education 内容中提取,保持原文风格
|
||||
2. 岗位标题不存在时,从描述中归纳生成
|
||||
3. 薪资标准化为 10-20K、20K、面议 等格式,无效或空则null
|
||||
4. categoryId 必须从分类列表中选一个,不允许为null
|
||||
5. requiredIndustryId 仅当描述中明确提到行业经验要求时设置
|
||||
6. tags 是核心职能标签(如数据分析、团队协作),最多5个
|
||||
7. skillTags 是技能关键词(如Java、MySQL),最多8个
|
||||
8. companyShortName 去掉地区后缀、招聘后缀、括号内容,保持简洁
|
||||
9. 字符串值中不允许出现Tab、换行等控制字符,用空格或中文标点替代
|
||||
10. 只返回JSON,不要其他内容"""
|
||||
|
||||
|
||||
# ──────────── 第二次AI:专业匹配 ────────────
|
||||
MAJOR_MATCH_SYSTEM = """你是一个岗位专业匹配助手。根据岗位信息,判断该岗位对专业的要求。
|
||||
返回JSON格式:
|
||||
{
|
||||
"requiredMajorIds": [专业ID数组,从专业列表中选择最相关的,最多3个,无明确要求则空数组],
|
||||
"majorSensitivity": 0-2的数字(0=专业不限 1=优先相关专业 2=强制要求专业)
|
||||
}
|
||||
规则:
|
||||
1. 只能从给定专业列表中选择ID
|
||||
2. 根据岗位描述判断专业敏感度:明确写"XX专业"→2,写"相关专业优先"→1,未提及→0
|
||||
3. majorSensitivity为0时,requiredMajorIds应为空数组
|
||||
4. 只返回JSON,不要其他内容"""
|
||||
|
||||
|
||||
# ──────────── 第三次AI:技能提取 ────────────
|
||||
SKILL_EXTRACT_SYSTEM = """你是一个技能提取助手。根据岗位信息,提取该岗位要求的核心专业能力和工具技能。
|
||||
返回JSON数组格式,如:["java", "spring boot", "mysql", "redis"]
|
||||
规则:
|
||||
1. 统一使用小写字母
|
||||
2. 只保留核心词,去掉多余修饰(如"plc编程"→"plc","c语言"→"c","cad制图"→"cad")
|
||||
3. 同一技能只保留最具体的表述,不要同时出现上位词和下位词(如有"机械设计"就不要再出"机械")
|
||||
4. 提取范围包括:技术栈、专业领域知识、行业工具、专业资质能力等
|
||||
5. 不提取纯软技能(如沟通能力、团队协作、学习能力、积极主动)
|
||||
6. 不提取过于宽泛的标签(如"办公软件"、"windows")
|
||||
7. 如果岗位完全没有专业能力要求(纯看态度和素质),返回空数组 []
|
||||
8. 最多15个,按重要性排序
|
||||
9. 只返回JSON数组,不要其他内容"""
|
||||
|
||||
|
||||
# ──────────── 公司补充 ────────────
|
||||
COMPANY_ENRICH_SYSTEM = """你是一个企业信息补充助手。根据提供的公司简称,补充该公司的详细信息。
|
||||
|
||||
返回JSON格式要求:
|
||||
{
|
||||
"valid": true/false,
|
||||
"name": "公司全称",
|
||||
"city": "总部所在城市,精确到市",
|
||||
"companyType": "企业类型",
|
||||
"industryId": 行业ID,
|
||||
"tags": ["公司标签,最多5个"],
|
||||
"summary": "一句话简介,100字以内",
|
||||
"description": "公司详细描述,500字以内",
|
||||
"foundedYear": "成立年份",
|
||||
"address": "总部/注册地址",
|
||||
"scale": "企业规模",
|
||||
"website": "官网地址",
|
||||
"financingStage": "融资状态",
|
||||
"latestValuation": "最新估值",
|
||||
"news": ["相关新闻,最多3条,每条50字以内"]
|
||||
}
|
||||
|
||||
规则:
|
||||
1. 如果不认识该公司,返回 {"valid": false}
|
||||
2. name 根据公司简称推断完整的企业注册名称
|
||||
3. companyType 取值:上市企业、独角兽、国企、央企、民营企业、外资企业、合资企业、事业单位、其他
|
||||
4. industryId 必须从给定行业列表中选择,不确定则null
|
||||
5. scale 取值:少于50人、50-150人、150-500人、500-1000人、1000-5000人、5000-10000人、10000人以上
|
||||
6. tags 体现公司核心业务特征,最多5个
|
||||
7. news 基于你的知识提供该公司最新的3条相关新闻,每条50字以内
|
||||
8. latestValuation 知道就提供,不知道则null
|
||||
9. 不确定的字段返回null,不要编造
|
||||
10. 字符串值中不允许出现Tab、换行等控制字符
|
||||
11. 只返回JSON,不要其他内容"""
|
||||
@@ -0,0 +1,18 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from .settings import Settings
|
||||
|
||||
# ENV selects which .env file is loaded; it defaults to "dev".
_env = os.environ.get("ENV", "dev")
_env_files = {
    "dev": ".env",
    "test": ".env.test",
    "prod": ".env.prod",
}

# Project root: three .parent hops up from this file.
_project_root = Path(__file__).resolve().parents[2]
# An ENV value missing from the table falls back to ".env".
_env_file = _project_root / _env_files.get(_env, ".env")

# Fail at import time when the selected file is missing.
if not _env_file.exists():
    raise FileNotFoundError(f".env 文件不存在: {_env_file}")

settings = Settings(_env_file=str(_env_file))
|
||||
|
||||
__all__ = ["settings"]
|
||||
@@ -0,0 +1,70 @@
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Project configuration.

    The defaults below are overridden by environment variables or by the
    .env file; names are matched case-insensitively and unknown keys are
    ignored (see ``model_config`` at the end of the class).

    ``volcengine_api_key`` has no working default: supply it through
    ``VOLCENGINE_API_KEY``. A key value was previously committed as the
    default of this field; it has been in version control and should be
    rotated.
    """

    # Environment name
    env: str = "dev"

    # ──────────── PostgreSQL (local, source DB holding app_job_data) ────────────
    pg_host: str = "192.168.31.51"
    pg_port: int = 5432
    pg_user: str = "postgres"
    pg_password: str = ""
    pg_db: str = "postgres"
    pg_pool_size: int = 10
    pg_max_overflow: int = 20

    # ──────────── MySQL (business DB) ────────────
    db_host: str = "192.168.31.105"
    db_port: int = 3306
    db_user: str = "root"
    # NOTE(review): weak default kept so existing setups keep working;
    # override it in .env.
    db_password: str = "123456"
    db_name: str = "offerpie"
    mysql_pool_size: int = 10
    mysql_max_overflow: int = 20

    # ──────────── AI provider ────────────
    # Secret: intentionally empty, must come from the environment / .env.
    volcengine_api_key: str = ""
    volcengine_base_url: str = "https://ark.cn-beijing.volces.com/api/v3"

    # ──────────── Job cleaning parameters ────────────
    clean_batch_size: int = 100
    clean_concurrency: int = 50
    clean_interval_seconds: int = 180

    # ──────────── Company enrichment parameters ────────────
    company_batch_size: int = 20
    company_concurrency: int = 10
    company_interval_seconds: int = 300

    # ──────────── Job expiry parameters ────────────
    job_expire_days: int = 7

    # ──────────── Logging ────────────
    logging_level: str = "INFO"
    log_file_name: str = "cleaner.log"

    @property
    def pg_url(self) -> str:
        """Return the ``postgresql+asyncpg`` connection URL.

        User name and password are both percent-encoded so that characters
        such as ``@``, ``:`` or ``/`` cannot corrupt the URL.
        """
        from urllib.parse import quote

        return (
            f"postgresql+asyncpg://{quote(self.pg_user, safe='')}:{quote(self.pg_password, safe='')}"
            f"@{self.pg_host}:{self.pg_port}/{self.pg_db}"
        )

    @property
    def mysql_url(self) -> str:
        """Return the ``mysql+asyncmy`` connection URL.

        User name and password are both percent-encoded so that characters
        such as ``@``, ``:`` or ``/`` cannot corrupt the URL.
        """
        from urllib.parse import quote

        return (
            f"mysql+asyncmy://{quote(self.db_user, safe='')}:{quote(self.db_password, safe='')}"
            f"@{self.db_host}:{self.db_port}/{self.db_name}"
        )

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )
|
||||
@@ -0,0 +1,79 @@
|
||||
"""双数据源:PostgreSQL(源库) + MySQL(业务库)"""
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy.ext.asyncio import (
|
||||
AsyncEngine,
|
||||
AsyncSession,
|
||||
async_sessionmaker,
|
||||
create_async_engine,
|
||||
)
|
||||
from sqlalchemy.orm import DeclarativeBase
|
||||
|
||||
from app.config import settings
|
||||
from app.core.logger import log
|
||||
|
||||
# ──────────── Internal variables ────────────
# All four start as None; init_db() assigns them.
_pg_engine: Optional[AsyncEngine] = None
_pg_session_factory: Optional[async_sessionmaker[AsyncSession]] = None

_mysql_engine: Optional[AsyncEngine] = None
_mysql_session_factory: Optional[async_sessionmaker[AsyncSession]] = None
|
||||
|
||||
|
||||
class PgBase(DeclarativeBase):
    """Declarative base class for the PostgreSQL ORM models."""

    pass
|
||||
|
||||
|
||||
class MysqlBase(DeclarativeBase):
    """Declarative base class for the MySQL ORM models."""

    pass
|
||||
|
||||
|
||||
async def init_db() -> None:
    """Create the PostgreSQL and MySQL engines and their session factories.

    The results are stored in the module-level variables read by
    PgSession(), MysqlSession() and close_db().
    """
    global _pg_engine, _pg_session_factory, _mysql_engine, _mysql_session_factory

    # Options that are the same for both engines.
    shared_options = {"pool_recycle": 3600, "echo": False}

    _pg_engine = create_async_engine(
        settings.pg_url,
        pool_size=settings.pg_pool_size,
        max_overflow=settings.pg_max_overflow,
        **shared_options,
    )
    _pg_session_factory = async_sessionmaker(_pg_engine, expire_on_commit=False)

    _mysql_engine = create_async_engine(
        settings.mysql_url,
        pool_size=settings.mysql_pool_size,
        max_overflow=settings.mysql_max_overflow,
        **shared_options,
    )
    _mysql_session_factory = async_sessionmaker(_mysql_engine, expire_on_commit=False)

    log.info("双数据源初始化完成: PG={}, MySQL={}", settings.pg_host, settings.db_host)
|
||||
|
||||
|
||||
async def close_db() -> None:
    """Dispose both engines, if they were created.

    The MySQL engine is disposed in a ``finally`` clause so that a failure
    while disposing the PostgreSQL engine cannot leave the MySQL engine
    open. Such a failure still propagates to the caller.
    """
    try:
        if _pg_engine is not None:
            await _pg_engine.dispose()
    finally:
        if _mysql_engine is not None:
            await _mysql_engine.dispose()
    log.info("双数据源已关闭")
|
||||
|
||||
|
||||
def PgSession() -> AsyncSession:
    """Return a new PostgreSQL async session.

    Intended use: ``async with PgSession() as session``.

    Raises:
        RuntimeError: if init_db() has not created the session factory yet.
    """
    factory = _pg_session_factory
    if factory is not None:
        return factory()
    raise RuntimeError("数据库未初始化,请先调用 init_db()")
|
||||
|
||||
|
||||
def MysqlSession() -> AsyncSession:
    """Return a new MySQL async session.

    Intended use: ``async with MysqlSession() as session``.

    Raises:
        RuntimeError: if init_db() has not created the session factory yet.
    """
    factory = _mysql_session_factory
    if factory is not None:
        return factory()
    raise RuntimeError("数据库未初始化,请先调用 init_db()")
|
||||
@@ -0,0 +1,34 @@
|
||||
"""日志配置"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from app.config import settings
|
||||
|
||||
# Log directory, relative to the current working directory.
_log_dir = Path("logs")
_log_dir.mkdir(exist_ok=True)

# Sink formats: the console one carries colour markup, the file one is plain.
_CONSOLE_FORMAT = (
    "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
    "<level>{level: <8}</level> | "
    "<cyan>{name}</cyan>:<cyan>{line}</cyan> - "
    "<level>{message}</level>"
)
_FILE_FORMAT = "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{line} - {message}"

# Drop the default handler before installing our own sinks.
logger.remove()

# Console sink
logger.add(sys.stdout, level=settings.logging_level, format=_CONSOLE_FORMAT)

# File sink: rotate at 00:00, keep 30 days.
logger.add(
    _log_dir / settings.log_file_name,
    level=settings.logging_level,
    rotation="00:00",
    retention="30 days",
    encoding="utf-8",
    format=_FILE_FORMAT,
)

# Name the rest of the project imports.
log = logger
|
||||
+64
@@ -0,0 +1,64 @@
|
||||
"""项目入口:初始化数据源、加载字典、启动调度器"""
|
||||
|
||||
import asyncio
|
||||
import signal
|
||||
import warnings
|
||||
from datetime import datetime
|
||||
|
||||
from app.core.logger import log
|
||||
|
||||
# Silence the "Duplicate entry" warnings produced by asyncmy for INSERT IGNORE.
warnings.filterwarnings("ignore", message=".*Duplicate entry.*")
|
||||
from app.core.database import init_db, close_db
|
||||
from app.services.dict_cache_service import dict_cache
|
||||
from app.scheduler.tasks import create_scheduler
|
||||
|
||||
|
||||
async def main():
    """Run the cleaner until a shutdown signal arrives.

    Initialises both data sources, loads the dictionary cache, starts the
    scheduler, triggers one immediate job-clean and company-clean run, then
    waits. Everything after init_db() sits inside one try/finally so the
    engines are closed even when startup itself fails.
    """
    log.info("=" * 50)
    log.info("OfferPie Job Cleaner 启动中...")
    log.info("=" * 50)

    # Initialise both data sources
    await init_db()

    scheduler = None  # assigned only after start() succeeded
    try:
        # Load the dictionary cache
        await dict_cache.refresh()

        # Create and start the scheduler
        new_scheduler = create_scheduler()
        new_scheduler.start()
        scheduler = new_scheduler

        # Trigger one job-clean and one company-clean run right away.
        # The timestamp must be timezone-aware: APScheduler interprets a
        # naive datetime in the scheduler's own timezone, so a naive local
        # "now" on a host in another zone lands hours in the past or future.
        now = datetime.now(scheduler.timezone)
        scheduler.modify_job("job_clean", next_run_time=now)
        scheduler.modify_job("company_clean", next_run_time=now)

        log.info("调度器已启动,所有定时任务已注册")

        # Graceful shutdown
        stop_event = asyncio.Event()

        def _shutdown(*args):
            log.info("收到关闭信号,正在关闭...")
            stop_event.set()

        loop = asyncio.get_running_loop()
        # Unix: SIGINT + SIGTERM; Windows: relies on KeyboardInterrupt only
        for sig in (signal.SIGINT, signal.SIGTERM):
            try:
                loop.add_signal_handler(sig, _shutdown)
            except (NotImplementedError, ValueError):
                # Signal handlers unavailable here; fall back to KeyboardInterrupt.
                pass

        try:
            await stop_event.wait()
        except KeyboardInterrupt:
            pass
    finally:
        if scheduler is not None:
            scheduler.shutdown(wait=False)
        await close_db()
        log.info("OfferPie Job Cleaner 已关闭")
|
||||
|
||||
|
||||
# Script entry point: run main() under asyncio.run().
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -0,0 +1,36 @@
|
||||
"""MySQL: bg_company 表模型"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import BigInteger, DateTime, Integer, JSON, String, Text
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from app.core.database import MysqlBase
|
||||
|
||||
|
||||
class Company(MysqlBase):
    """ORM mapping of the ``bg_company`` table (companies)."""

    __tablename__ = "bg_company"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    name: Mapped[Optional[str]] = mapped_column(String(255))
    short_name: Mapped[str] = mapped_column(String(128), nullable=False)
    logo_url: Mapped[Optional[str]] = mapped_column(String(512))
    region_code: Mapped[Optional[str]] = mapped_column(String(20))
    company_type: Mapped[Optional[str]] = mapped_column(String(32))
    industry_id: Mapped[Optional[int]] = mapped_column(BigInteger)
    tags: Mapped[Optional[list]] = mapped_column(JSON)
    summary: Mapped[Optional[str]] = mapped_column(String(512))
    description: Mapped[Optional[str]] = mapped_column(Text)
    founded_year: Mapped[Optional[str]] = mapped_column(String(10))
    address: Mapped[Optional[str]] = mapped_column(String(255))
    scale: Mapped[Optional[str]] = mapped_column(String(32))
    website: Mapped[Optional[str]] = mapped_column(String(255))
    financing_stage: Mapped[Optional[str]] = mapped_column(String(32))
    latest_valuation: Mapped[Optional[str]] = mapped_column(String(64))
    news: Mapped[Optional[list]] = mapped_column(JSON)
    # Status codes: 0=to be enriched, 1=enriched, 2=disabled, 3=enriching, 4=enrichment failed
    status: Mapped[int] = mapped_column(Integer, default=0, comment="0=待完善 1=已完善 2=禁用 3=补充中 4=补充失败")
    create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
    update_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
||||
@@ -0,0 +1,39 @@
|
||||
"""MySQL: bg_job 表模型"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import BigInteger, DateTime, Integer, JSON, String, Text
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from app.core.database import MysqlBase
|
||||
|
||||
|
||||
class Job(MysqlBase):
    """ORM mapping of the ``bg_job`` table (job postings)."""

    __tablename__ = "bg_job"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    title: Mapped[str] = mapped_column(String(255), nullable=False)
    company_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
    category_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
    employment_type: Mapped[int] = mapped_column(Integer, default=0)
    description: Mapped[Optional[str]] = mapped_column(Text)
    requirement: Mapped[Optional[str]] = mapped_column(Text)
    bonus: Mapped[Optional[str]] = mapped_column(Text)
    tags: Mapped[Optional[list]] = mapped_column(JSON)
    skill_tags: Mapped[Optional[list]] = mapped_column(JSON)
    salary: Mapped[Optional[str]] = mapped_column(String(64))
    education: Mapped[int] = mapped_column(Integer, default=0)
    min_experience: Mapped[int] = mapped_column(Integer, default=0)
    required_industry_id: Mapped[Optional[int]] = mapped_column(BigInteger)
    required_major_ids: Mapped[Optional[list]] = mapped_column(JSON)
    major_sensitivity: Mapped[Optional[int]] = mapped_column(Integer)
    source_url: Mapped[Optional[str]] = mapped_column(String(1024))
    source_id: Mapped[Optional[str]] = mapped_column(String(64))
    # Recruit category: 0=campus, 1=internship, 2=experienced hire, 3=other
    recruit_category: Mapped[Optional[int]] = mapped_column(Integer, comment="招聘分类: 0=校招, 1=实习, 2=社招, 3=其他")
    # NOTE(review): the column is named expire_at but its DB comment reads
    # "发布日期" (publish date) — confirm which meaning is intended.
    expire_at: Mapped[Optional[datetime]] = mapped_column(DateTime, comment="发布日期")
    # Status codes: 0=listed, 1=delisted, 2=invalidated
    status: Mapped[int] = mapped_column(Integer, default=0, comment="0=上架 1=下架 2=已失效")
    create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
    update_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
||||
@@ -0,0 +1,30 @@
|
||||
"""MySQL: 关联表模型"""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import BigInteger, DateTime, String
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from app.core.database import MysqlBase
|
||||
|
||||
|
||||
class JobRegionRelation(MysqlBase):
    """ORM mapping of ``bg_job_region_relation`` (job-to-region links)."""

    __tablename__ = "bg_job_region_relation"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    job_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
    region_code: Mapped[str] = mapped_column(String(20), nullable=False)
    create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
||||
|
||||
|
||||
class JobSkillTagRelation(MysqlBase):
    """ORM mapping of ``bg_job_skill_tag_relation`` (job-to-skill-tag links)."""

    __tablename__ = "bg_job_skill_tag_relation"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    job_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
    skill_tag_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
    create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
|
||||
@@ -0,0 +1,15 @@
|
||||
"""MySQL: bg_skill_tag 表模型"""
|
||||
|
||||
from sqlalchemy import BigInteger, String
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from app.core.database import MysqlBase
|
||||
|
||||
|
||||
class SkillTag(MysqlBase):
    """ORM mapping of the ``bg_skill_tag`` table (skill tags)."""

    __tablename__ = "bg_skill_tag"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    # Tag names are declared unique.
    name: Mapped[str] = mapped_column(String(100), unique=True, nullable=False)
|
||||
@@ -0,0 +1,35 @@
|
||||
"""PostgreSQL: app_job_data 表模型"""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import BigInteger, DateTime, Integer, SmallInteger, String, Text
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from app.core.database import PgBase
|
||||
|
||||
|
||||
class AppJobData(PgBase):
    """ORM mapping of ``app_job_data``: raw job postings from the crawler."""

    __tablename__ = "app_job_data"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
    # Reference to the related urllistid
    urllistid: Mapped[int] = mapped_column(BigInteger, nullable=False, comment="关联urllistid")
    job_title: Mapped[Optional[str]] = mapped_column(String(255))
    salary: Mapped[Optional[str]] = mapped_column(String(128))
    location: Mapped[Optional[str]] = mapped_column(String(2048))
    # Company name
    company: Mapped[Optional[str]] = mapped_column(String(255), comment="公司名字")
    experience: Mapped[Optional[str]] = mapped_column(String(64))
    education: Mapped[Optional[str]] = mapped_column(String(64))
    description: Mapped[str] = mapped_column(Text, nullable=False)
    detail_url: Mapped[str] = mapped_column(String(1024), nullable=False)
    # Recruit category: 0=campus, 1=internship, 2=experienced hire, 3=other
    recruit_category: Mapped[int] = mapped_column(SmallInteger, default=3, nullable=False, comment="招聘分类: 0=校招, 1=实习, 2=社招, 3=其他")
    content_hash: Mapped[str] = mapped_column(String(64), nullable=False)
    sources: Mapped[int] = mapped_column(SmallInteger, default=0, nullable=False)
    # NOTE(review): the column is named expire_at but its DB comment reads
    # "发布日期" (publish date) — confirm which meaning is intended.
    expire_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="发布日期")
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
    updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
    # Cleaning state; the DB comment lists pending/cleaning/cleaned/discarded.
    clean_status: Mapped[str] = mapped_column(String(20), default="pending", nullable=False, comment="pending/cleaning/cleaned/discarded")
    clean_started_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
    cleaned_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
|
||||
@@ -0,0 +1,105 @@
|
||||
"""定时任务注册"""
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from apscheduler.triggers.interval import IntervalTrigger
|
||||
from apscheduler.triggers.cron import CronTrigger
|
||||
|
||||
from app.config import settings
|
||||
from app.core.logger import log
|
||||
|
||||
|
||||
def create_scheduler() -> AsyncIOScheduler:
    """Build the scheduler and register every periodic task on it.

    The scheduler is returned unstarted. Each task is limited to one
    concurrent instance.
    """
    scheduler = AsyncIOScheduler(
        timezone="Asia/Shanghai",
        job_defaults={"misfire_grace_time": 60},
    )

    # (callable, trigger, job id, display name), in registration order.
    registrations = (
        # Job cleaning (every N seconds)
        (_job_clean_task, IntervalTrigger(seconds=settings.clean_interval_seconds), "job_clean", "岗位清洗"),
        # Company enrichment (every N seconds)
        (_company_clean_task, IntervalTrigger(seconds=settings.company_interval_seconds), "company_clean", "公司补充"),
        # Job zombie recovery (every 30 minutes)
        (_job_zombie_task, IntervalTrigger(minutes=30), "job_zombie_recover", "岗位僵尸恢复"),
        # Company zombie recovery (every hour)
        (_company_zombie_task, IntervalTrigger(hours=1), "company_zombie_recover", "公司僵尸恢复"),
        # Job delisting (daily at 02:00)
        (_job_expire_task, CronTrigger(hour=2, minute=0), "job_expire", "岗位下架"),
    )
    for task, trigger, job_id, job_name in registrations:
        scheduler.add_job(task, trigger=trigger, id=job_id, name=job_name, max_instances=1)

    return scheduler
|
||||
|
||||
|
||||
async def _job_clean_task():
    """Scheduler callback: run one job-cleaning batch.

    Any Exception raised by run_job_clean() is logged, not propagated.
    """
    # Imported when the task runs, not when this module is imported.
    from app.services.job_clean_service import run_job_clean

    try:
        await run_job_clean()
    except Exception as e:
        # Same ERROR-level message as before, plus the traceback.
        log.exception("岗位清洗任务异常: {}", e)
|
||||
|
||||
|
||||
async def _company_clean_task():
    """Scheduler callback: run one company-enrichment batch.

    Any Exception raised by run_company_clean() is logged, not propagated.
    """
    # Imported when the task runs, not when this module is imported.
    from app.services.company_clean_service import run_company_clean

    try:
        await run_company_clean()
    except Exception as e:
        # Same ERROR-level message as before, plus the traceback.
        log.exception("公司补充任务异常: {}", e)
|
||||
|
||||
|
||||
async def _job_zombie_task():
    """Scheduler callback: run job zombie recovery.

    Any Exception raised by recover_job_zombie() is logged, not propagated.
    """
    # Imported when the task runs, not when this module is imported.
    from app.services.zombie_recover_service import recover_job_zombie

    try:
        await recover_job_zombie()
    except Exception as e:
        # Same ERROR-level message as before, plus the traceback.
        log.exception("岗位僵尸恢复异常: {}", e)
|
||||
|
||||
|
||||
async def _company_zombie_task():
    """Scheduler callback: run company zombie recovery.

    Any Exception raised by recover_company_zombie() is logged, not propagated.
    """
    # Imported when the task runs, not when this module is imported.
    from app.services.zombie_recover_service import recover_company_zombie

    try:
        await recover_company_zombie()
    except Exception as e:
        # Same ERROR-level message as before, plus the traceback.
        log.exception("公司僵尸恢复异常: {}", e)
|
||||
|
||||
|
||||
async def _job_expire_task():
    """Scheduler callback: run the job-delisting pass.

    Any Exception raised by run_job_expire() is logged, not propagated.
    """
    # Imported when the task runs, not when this module is imported.
    from app.services.job_expire_service import run_job_expire

    try:
        await run_job_expire()
    except Exception as e:
        # Same ERROR-level message as before, plus the traceback.
        log.exception("岗位下架异常: {}", e)
|
||||
@@ -0,0 +1,65 @@
|
||||
"""AI 调用工具封装"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langchain_core.messages import SystemMessage, HumanMessage
|
||||
|
||||
from app.core.logger import log
|
||||
|
||||
# Matches a markdown code fence and captures what is inside it.
_CODE_BLOCK_RE = re.compile(r"```\w*\s*\n?(.*?)\n?\s*```", re.DOTALL)
# Control characters to strip; \t, \n and \r are deliberately kept.
_CONTROL_CHAR_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
# Closing delimiter that pairs with each JSON opening delimiter.
_JSON_CLOSERS = {"{": "}", "[": "]"}


def clean_ai_response(response: str) -> str:
    """Extract a clean JSON string from the text returned by the model.

    Steps:
      1. Empty or whitespace-only input yields "".
      2. If the text contains a markdown code fence, its content is used.
      3. Otherwise the text is cut to start at the first "{" or "[" and to
         end at the last matching closer ("}" or "]"), which drops any
         explanation written before or after the JSON. When no opener is
         found the text is left as is; when no closer follows the opener
         only the leading text is dropped.
      4. Control characters other than tab, newline and carriage return
         are removed.

    Args:
        response: Raw model output.

    Returns:
        The cleaned text. It is not guaranteed to be valid JSON.
    """
    if not response or not response.strip():
        return ""

    result = response.strip()

    fenced = _CODE_BLOCK_RE.search(result)
    if fenced:
        result = fenced.group(1).strip()
    else:
        # Position of the first JSON opener, if there is one.
        starts = [pos for pos in (result.find("{"), result.find("[")) if pos >= 0]
        if starts:
            start = min(starts)
            end = result.rfind(_JSON_CLOSERS[result[start]])
            # Trim trailing chatter only when a closer follows the opener.
            result = result[start:end + 1] if end > start else result[start:]

    return _CONTROL_CHAR_RE.sub("", result)
|
||||
|
||||
|
||||
async def ai_chat(llm: ChatOpenAI, system_prompt: str, user_message: str) -> str:
    """Send one system + one human message to the model; return the reply content."""
    system = SystemMessage(content=system_prompt)
    human = HumanMessage(content=user_message)
    reply = await llm.ainvoke([system, human])
    return reply.content
|
||||
|
||||
|
||||
async def ai_chat_json(llm: ChatOpenAI, system_prompt: str, user_message: str) -> Any:
    """Call the model and return its reply parsed as JSON.

    Args:
        llm: Chat model to call.
        system_prompt: System message text.
        user_message: Human message text.

    Returns:
        The decoded JSON value, or None when the reply cannot be decoded
        (a warning with the first 200 characters of the raw reply is logged).
    """
    raw = await ai_chat(llm, system_prompt, user_message)
    cleaned = clean_ai_response(raw)
    try:
        # strict=False accepts raw tab/newline characters inside string
        # values; anything that decoded under the strict default still
        # decodes to the same value.
        return json.loads(cleaned, strict=False)
    except json.JSONDecodeError as e:
        log.warning("AI JSON 解析失败: {}, raw={}", e, raw[:200])
        return None
|
||||
@@ -0,0 +1,138 @@
|
||||
"""公司数据补充服务(协程版)"""
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.config import settings
|
||||
from app.core.database import MysqlSession
|
||||
from app.core.logger import log
|
||||
from app.ai.model_config import CompanyCleanModel
|
||||
from app.ai.prompts import COMPANY_ENRICH_SYSTEM
|
||||
from app.services.ai_tool import ai_chat_json
|
||||
from app.services.dict_cache_service import dict_cache
|
||||
|
||||
|
||||
async def run_company_clean() -> None:
    """Run one company-enrichment batch.

    Selects up to ``company_batch_size`` rows with status 0, sets them to
    status 3 in the same transaction, then processes them concurrently
    with at most ``company_concurrency`` running at once.
    """
    # Claim a batch of companies waiting for enrichment.
    async with MysqlSession() as mysql:
        selected = await mysql.execute(
            text("""
                SELECT * FROM bg_company
                WHERE status = 0
                LIMIT :limit
                FOR UPDATE SKIP LOCKED
            """),
            {"limit": settings.company_batch_size},
        )
        rows = selected.mappings().all()
        if not rows:
            return

        # The ids come straight from the bigint id column of the rows just
        # read, so joining them into the statement text is safe.
        ids_str = ",".join(str(row["id"]) for row in rows)
        await mysql.execute(
            text(f"UPDATE bg_company SET status = 3, update_time = NOW() WHERE id IN ({ids_str})"),
        )
        await mysql.commit()

    log.info("公司补充:锁定{}条数据", len(rows))

    # Fan out as coroutines, throttled by a semaphore.
    sem = asyncio.Semaphore(settings.company_concurrency)
    await asyncio.gather(
        *[_clean_one(sem, dict(row)) for row in rows],
        return_exceptions=True,
    )
|
||||
|
||||
|
||||
async def _clean_one(sem: asyncio.Semaphore, company: dict) -> None:
    """Enrich one company while holding a semaphore slot.

    Any Exception raised by _do_clean() is logged, not propagated.
    """
    await sem.acquire()
    try:
        try:
            await _do_clean(company)
        except Exception as exc:
            log.error("公司补充异常, id={}, shortName={}: {}", company["id"], company.get("short_name"), exc)
    finally:
        sem.release()
|
||||
|
||||
|
||||
async def _do_clean(company: dict) -> None:
    """Enrich one company row with AI-provided details.

    Asks the model about the company's short name and the industry list.
    When the reply is missing, is not a JSON object, or is not flagged
    ``valid``, the row is set to status 4. Otherwise the returned fields
    are written back and the row is set to status 1.

    Args:
        company: The bg_company row as a dict; "id" is required.
    """
    company_id = company["id"]
    short_name = company.get("short_name", "")

    user_msg = f"【公司简称】\n{short_name}\n\n【行业列表】\n{dict_cache.industry_text}"
    result = await ai_chat_json(CompanyCleanModel.ENRICH, COMPANY_ENRICH_SYSTEM, user_msg)

    # The reply may be None (decode failure) or valid JSON that is not an
    # object (e.g. an array); both count as a failed enrichment rather
    # than raising on .get().
    if not isinstance(result, dict) or not result.get("valid", False):
        await _update_status(company_id, 4)
        return

    # Region matching
    city = result.get("city")
    region_code = dict_cache.match_region_code(city) if city else None

    # Write the data back. COALESCE keeps the stored value when the new one
    # is NULL; industry_id, tags and news are always overwritten.
    now = datetime.now()
    async with MysqlSession() as mysql:
        await mysql.execute(
            text("""
                UPDATE bg_company SET
                    name = COALESCE(:name, name),
                    region_code = COALESCE(:region_code, region_code),
                    company_type = COALESCE(:company_type, company_type),
                    industry_id = :industry_id,
                    tags = :tags,
                    summary = COALESCE(:summary, summary),
                    description = COALESCE(:description, description),
                    founded_year = COALESCE(:founded_year, founded_year),
                    address = COALESCE(:address, address),
                    scale = COALESCE(:scale, scale),
                    website = COALESCE(:website, website),
                    financing_stage = COALESCE(:financing_stage, financing_stage),
                    latest_valuation = COALESCE(:latest_valuation, latest_valuation),
                    news = :news,
                    status = 1,
                    update_time = :now
                WHERE id = :id
            """),
            {
                "name": result.get("name"),
                "region_code": region_code,
                "company_type": result.get("companyType"),
                "industry_id": result.get("industryId"),
                "tags": _to_json(result.get("tags")),
                "summary": result.get("summary"),
                "description": result.get("description"),
                "founded_year": result.get("foundedYear"),
                "address": result.get("address"),
                "scale": result.get("scale"),
                "website": result.get("website"),
                "financing_stage": result.get("financingStage"),
                "latest_valuation": result.get("latestValuation"),
                "news": _to_json(result.get("news")),
                "now": now,
                "id": company_id,
            },
        )
        await mysql.commit()

    log.info("公司补充完成, id={}, shortName={}", company_id, short_name)
|
||||
|
||||
|
||||
async def _update_status(company_id: int, status: int) -> None:
    """Set the status of one bg_company row, refresh update_time, and commit."""
    statement = text("UPDATE bg_company SET status = :s, update_time = NOW() WHERE id = :id")
    params = {"s": status, "id": company_id}
    async with MysqlSession() as mysql:
        await mysql.execute(statement, params)
        await mysql.commit()
|
||||
|
||||
|
||||
def _to_json(value) -> str | None:
|
||||
"""列表转 JSON 字符串"""
|
||||
import json
|
||||
if value and isinstance(value, list):
|
||||
return json.dumps(value, ensure_ascii=False)
|
||||
return None
|
||||
@@ -0,0 +1,91 @@
|
||||
"""字典数据缓存服务
|
||||
|
||||
启动时从 MySQL 加载岗位分类、行业、专业分类、地区数据到内存。
|
||||
"""
|
||||
|
||||
from sqlalchemy import select, text
|
||||
|
||||
from app.core.database import MysqlSession
|
||||
from app.core.logger import log
|
||||
|
||||
|
||||
class DictCacheService:
|
||||
"""字典缓存,单例使用"""
|
||||
|
||||
def __init__(self):
|
||||
self.job_category_text: str = ""
|
||||
self.industry_text: str = ""
|
||||
self.major_category_text: str = ""
|
||||
self._region_list: list[dict] = []
|
||||
|
||||
async def refresh(self) -> None:
    """Reload all dictionary data from MySQL into the in-memory cache.

    Rebuilds three comma-joined "id:name(path)" strings (job categories,
    industries, major categories) and the list of region rows, then logs
    how many rows of each kind were loaded.
    """
    async with MysqlSession() as session:

        async def fetch(sql: str):
            # Run one statement and return every row as a mapping.
            outcome = await session.execute(text(sql))
            return outcome.mappings().all()

        # Job categories: level-3 rows with their parent and root names.
        categories = await fetch("""
            SELECT c.id, c.name, c.parent_id, c.root_id, c.level,
                   p.name AS parent_name, r.name AS root_name
            FROM bg_job_category c
            LEFT JOIN bg_job_category p ON c.parent_id = p.id
            LEFT JOIN bg_job_category r ON c.root_id = r.id
            WHERE c.level = 3
        """)
        self.job_category_text = ", ".join(
            [f"{c['id']}:{c['name']}({c['root_name']}/{c['parent_name']})" for c in categories]
        )

        # Industries: level-2 rows with their parent name.
        industries = await fetch("""
            SELECT i.id, i.name, p.name AS parent_name
            FROM bg_industry i
            LEFT JOIN bg_industry p ON i.parent_id = p.id
            WHERE i.level = 2
        """)
        self.industry_text = ", ".join(
            [f"{i['id']}:{i['name']}({i['parent_name']})" for i in industries]
        )

        # Major categories: level-3 rows with their parent and root names.
        majors = await fetch("""
            SELECT m.id, m.name, m.parent_id, m.root_id,
                   p.name AS parent_name, r.name AS root_name
            FROM bg_major_category m
            LEFT JOIN bg_major_category p ON m.parent_id = p.id
            LEFT JOIN bg_major_category r ON m.root_id = r.id
            WHERE m.level = 3
        """)
        self.major_category_text = ", ".join(
            [f"{m['id']}:{m['name']}({m['root_name']}/{m['parent_name']})" for m in majors]
        )

        # Regions: rows whose city_code is NULL.
        regions = await fetch("""
            SELECT code, name FROM bg_china_regions_code WHERE city_code IS NULL
        """)
        self._region_list = [dict(r) for r in regions]

    log.info(
        "字典缓存加载完成: 岗位分类{}条, 行业{}条, 专业{}条, 地区{}条",
        len(categories), len(industries), len(majors), len(self._region_list),
    )
|
||||
|
||||
def match_region_code(self, city_name: str) -> str | None:
|
||||
"""根据城市名模糊匹配地区编码"""
|
||||
if not city_name:
|
||||
return None
|
||||
name = city_name.replace("市", "").replace("省", "").strip()
|
||||
for r in self._region_list:
|
||||
r_name = r["name"].replace("市", "").replace("省", "")
|
||||
if name in r_name or r_name in name:
|
||||
return r["code"]
|
||||
return None
|
||||
|
||||
|
||||
# Module-level singleton: import `dict_cache` instead of constructing DictCacheService again.
|
||||
dict_cache = DictCacheService()
|
||||
@@ -0,0 +1,306 @@
|
||||
"""岗位清洗服务(协程版)"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
from snowflake import SnowflakeGenerator
|
||||
from sqlalchemy import text, insert
|
||||
|
||||
from app.config import settings
|
||||
from app.core.database import PgSession, MysqlSession
|
||||
from app.core.logger import log
|
||||
from app.ai.model_config import JobCleanModel
|
||||
from app.ai.prompts import JOB_STRUCTURE_SYSTEM, MAJOR_MATCH_SYSTEM, SKILL_EXTRACT_SYSTEM
|
||||
from app.models.mysql.job import Job
|
||||
from app.models.mysql.company import Company
|
||||
from app.models.mysql.relations import JobRegionRelation, JobSkillTagRelation
|
||||
from app.services.ai_tool import ai_chat_json
|
||||
from app.services.dict_cache_service import dict_cache
|
||||
|
||||
# Snowflake ID generator (instance=1) that supplies the primary keys inserted by this module.
|
||||
_id_gen = SnowflakeGenerator(instance=1)
|
||||
|
||||
# Lock around company lookup/creation so concurrent coroutines do not insert the same company twice.
|
||||
_company_lock = asyncio.Lock()
|
||||
|
||||
|
||||
async def run_job_clean() -> None:
    """Run one batch of job cleaning.

    Claims up to ``settings.clean_batch_size`` rows of ``app_job_data`` whose
    ``clean_status`` is ``'pending'`` (``FOR UPDATE SKIP LOCKED``), marks them
    ``'cleaning'`` and commits. The rows are then cleaned concurrently, with at
    most ``settings.clean_concurrency`` running at once, and a summary with the
    number of failed rows is logged.
    """
    # 1. Claim a batch of pending rows in PostgreSQL.
    async with PgSession() as pg:
        result = await pg.execute(
            text("""
                SELECT * FROM app_job_data
                WHERE clean_status = 'pending'
                LIMIT :limit
                FOR UPDATE SKIP LOCKED
            """),
            {"limit": settings.clean_batch_size},
        )
        rows = result.mappings().all()
        if not rows:
            return

        ids = [r["id"] for r in rows]
        await pg.execute(
            text("""
                UPDATE app_job_data
                SET clean_status = 'cleaning', clean_started_at = NOW()
                WHERE id = ANY(:ids)
            """),
            {"ids": ids},
        )
        await pg.commit()

    log.info("岗位清洗:锁定{}条数据", len(rows))

    # 2. Clean the rows concurrently; the semaphore caps parallelism.
    sem = asyncio.Semaphore(settings.clean_concurrency)
    outcomes = await asyncio.gather(
        *(_clean_one(sem, dict(r)) for r in rows),
        return_exceptions=True,
    )

    # _clean_one returns True on success and False on a handled failure;
    # anything else is an exception object captured by gather().
    errors = sum(1 for outcome in outcomes if outcome is not True)
    log.info("岗位清洗:本批完成,共{}条,异常{}条", len(rows), errors)


async def _clean_one(sem: asyncio.Semaphore, data: dict) -> bool:
    """Clean a single row while holding ``sem``.

    Args:
        sem: Semaphore limiting how many rows are cleaned at the same time.
        data: The ``app_job_data`` row as a dict; must contain ``"id"``.

    Returns:
        ``True`` when ``_do_clean`` finished without raising, ``False`` when it
        raised. The exception is logged and not propagated.
    """
    async with sem:
        try:
            await _do_clean(data)
        except Exception as e:
            log.error("岗位清洗异常, id={}: {}", data["id"], e)
            # The row is left in 'cleaning'; the zombie-recovery task resets it.
            return False
        return True
|
||||
|
||||
|
||||
async def _do_clean(data: dict) -> None:
    """Clean one raw job row and write it to MySQL.

    Flow:
      1. Discard rows whose description is shorter than 20 characters.
      2. Skip rows already present in ``bg_job`` (matched on ``source_id``).
         This runs before any model call so a re-processed row costs no
         request.
      3. First AI call: structured extraction; discard when the model returns
         nothing usable or marks the row invalid.
      4. Resolve the company and region codes, insert the job and its region
         relations, then mark the PostgreSQL row ``cleaned``.
      5. Second and third AI calls (major matching, skill extraction) are
         best-effort: failures are logged and do not undo the insert.

    Args:
        data: The ``app_job_data`` row as a dict; must contain ``"id"``.
    """
    data_id = data["id"]

    # Pre-check: very short descriptions are not worth cleaning.
    description = data.get("description") or ""
    if len(description) < 20:
        log.info("[id={}] 丢弃:描述过短({}字符)", data_id, len(description))
        await _update_pg_status(data_id, "discarded")
        return

    # Duplicate check, done before the model call.
    source_id = str(data_id)
    async with MysqlSession() as mysql:
        existing = await mysql.execute(
            text("SELECT COUNT(*) AS cnt FROM bg_job WHERE source_id = :sid"),
            {"sid": source_id},
        )
        already_imported = existing.scalar() > 0
    if already_imported:
        log.info("[id={}] 跳过:已入库(去重)", data_id)
        await _update_pg_status(data_id, "cleaned")
        return

    # First AI call: structured extraction.
    user_message = _build_user_message(data)
    result = await ai_chat_json(JobCleanModel.STRUCTURE, JOB_STRUCTURE_SYSTEM, user_message)
    # A non-dict payload (e.g. a JSON list) is treated like an invalid verdict
    # instead of crashing on .get().
    if not isinstance(result, dict) or not result.get("valid", False):
        log.info("[id={}] 丢弃:AI判定无效", data_id)
        await _update_pg_status(data_id, "discarded")
        return

    # Company: prefer the model's short name, fall back to the raw field.
    company_short_name = result.get("companyShortName") or data.get("company") or ""
    company_id = await _find_or_create_company(company_short_name)

    # Regions: map city names to codes; drop misses and repeated codes so each
    # (job, region) relation is inserted once. Order of first occurrence is kept.
    matched_codes = (
        dict_cache.match_region_code(city) for city in result.get("cities") or []
    )
    region_codes = list(dict.fromkeys(code for code in matched_codes if code))

    # Insert the job and its region relations in one transaction.
    job_id = next(_id_gen)
    now = datetime.now()
    async with MysqlSession() as mysql:
        await mysql.execute(
            insert(Job).values(
                id=job_id,
                title=result.get("title", ""),
                company_id=company_id,
                category_id=result.get("categoryId", 0),
                employment_type=result.get("employmentType", 0),
                description=result.get("description", ""),
                requirement=result.get("requirement", ""),
                bonus=result.get("bonus"),
                tags=result.get("tags"),
                skill_tags=result.get("skillTags"),
                salary=result.get("salary"),
                education=result.get("education", 0),
                min_experience=result.get("minExperience", 0),
                required_industry_id=result.get("requiredIndustryId"),
                recruit_category=data.get("recruit_category", 3),
                expire_at=data.get("expire_at"),
                source_url=data.get("detail_url"),
                source_id=source_id,
                status=0,
                create_time=now,
                update_time=now,
            )
        )

        if region_codes:
            await mysql.execute(
                insert(JobRegionRelation),
                [
                    {"id": next(_id_gen), "job_id": job_id, "region_code": code, "create_time": now}
                    for code in region_codes
                ],
            )

        await mysql.commit()

    # Mark the source row as done.
    await _update_pg_status(data_id, "cleaned")
    log.info("[id={}] 入库成功:{} | 公司={} | 地区={}", data_id, result.get("title"), company_short_name, region_codes)

    # Second AI call: major matching (best-effort).
    try:
        await _match_major(job_id, result)
        log.debug("[id={}] 专业匹配完成", data_id)
    except Exception as e:
        log.warning("[id={}] 专业匹配失败: {}", data_id, e)

    # Third AI call: skill extraction (best-effort).
    try:
        await _extract_skill_tags(job_id, result)
        log.debug("[id={}] 技能提取完成", data_id)
    except Exception as e:
        log.warning("[id={}] 技能提取失败: {}", data_id, e)
|
||||
|
||||
|
||||
async def _match_major(job_id: int, result: dict) -> None:
    """Second AI call: match the job to major categories and store the result.

    Sends the job's title, description and requirement together with the
    cached major-category list to the model, then writes
    ``required_major_ids`` (JSON array, or NULL when empty) and
    ``major_sensitivity`` to ``bg_job``.

    Args:
        job_id: Primary key of the ``bg_job`` row to update.
        result: Structured extraction output of the first AI call.
    """
    title = result.get("title", "")
    desc = result.get("description", "")
    req = result.get("requirement", "")
    user_msg = f"【岗位信息】\n标题: {title}\n职责: {desc}\n要求: {req}\n\n【专业分类列表】\n{dict_cache.major_category_text}"

    data = await ai_chat_json(JobCleanModel.MAJOR_MATCH, MAJOR_MATCH_SYSTEM, user_msg)
    # Model output is not guaranteed to have the requested shape; anything but
    # a JSON object is ignored.
    if not isinstance(data, dict):
        return

    raw_ids = data.get("requiredMajorIds")
    if not isinstance(raw_ids, list):
        raw_ids = []

    major_ids = []
    for mid in raw_ids:
        # Accept IDs the model returned as digit strings.
        if isinstance(mid, str) and mid.strip().isdecimal():
            mid = int(mid)
        # bool is a subclass of int; exclude it so True does not become ID 1.
        if isinstance(mid, int) and not isinstance(mid, bool) and mid > 0:
            major_ids.append(mid)

    sensitivity = data.get("majorSensitivity", 0)

    async with MysqlSession() as mysql:
        await mysql.execute(
            text("""
                UPDATE bg_job SET required_major_ids = :ids, major_sensitivity = :s, update_time = :t
                WHERE id = :jid
            """),
            {
                "ids": json.dumps(major_ids) if major_ids else None,
                "s": sensitivity,
                "t": datetime.now(),
                "jid": job_id,
            },
        )
        await mysql.commit()
|
||||
|
||||
|
||||
async def _extract_skill_tags(job_id: int, result: dict) -> None:
    """Third AI call: extract skill tags and link them to the job.

    The model is expected to return a JSON list of skill names. Each name is
    trimmed and lower-cased; empty names, names longer than 50 characters,
    non-string items and repeats are dropped. Every remaining name is inserted
    into ``bg_skill_tag`` with ``INSERT IGNORE`` and its stored ID is read
    back, then one ``JobSkillTagRelation`` row per distinct tag ID is inserted.

    Args:
        job_id: Primary key of the job the tags belong to.
        result: Structured extraction output of the first AI call.
    """
    title = result.get("title", "")
    desc = result.get("description", "")
    req = result.get("requirement", "")
    user_msg = f"【岗位信息】\n标题: {title}\n职责: {desc}\n要求: {req}"

    skills = await ai_chat_json(JobCleanModel.SKILL_EXTRACT, SKILL_EXTRACT_SYSTEM, user_msg)
    if not skills or not isinstance(skills, list):
        return

    # Normalize and de-duplicate up front so each name costs one round trip.
    names = []
    seen = set()
    for raw in skills:
        # Stringifying None/numbers/objects would create bogus tags ("none").
        if not isinstance(raw, str):
            continue
        name = raw.strip().lower()
        if not name or len(name) > 50 or name in seen:
            continue
        seen.add(name)
        names.append(name)
    if not names:
        return

    now = datetime.now()
    tag_ids = []
    async with MysqlSession() as mysql:
        for name in names:
            # Insert if new. NOTE(review): this presumably relies on a unique
            # index on bg_skill_tag.name — confirm against the schema.
            await mysql.execute(
                text("INSERT IGNORE INTO bg_skill_tag (id, name) VALUES (:id, :name)"),
                {"id": next(_id_gen), "name": name},
            )
            # Read back the ID actually stored for this name.
            row = await mysql.execute(
                text("SELECT id FROM bg_skill_tag WHERE name = :name LIMIT 1"),
                {"name": name},
            )
            real_id = row.scalar()
            # Different names may resolve to the same row; keep IDs distinct.
            if real_id and real_id not in tag_ids:
                tag_ids.append(real_id)

        if tag_ids:
            await mysql.execute(
                insert(JobSkillTagRelation),
                [
                    {"id": next(_id_gen), "job_id": job_id, "skill_tag_id": tid, "create_time": now}
                    for tid in tag_ids
                ],
            )

        await mysql.commit()
|
||||
|
||||
|
||||
async def _find_or_create_company(short_name: str) -> int:
    """Return the ID of the company with ``short_name``, creating it if absent.

    The lookup and the insert run while holding the module-level
    ``_company_lock``, so coroutines sharing that lock cannot both insert the
    same company. A new row gets ``name`` and ``short_name`` set to
    ``short_name`` and ``status`` 0.

    Args:
        short_name: Company short name used as the lookup key.

    Returns:
        The existing or newly generated company ID.
    """
    async with _company_lock, MysqlSession() as session:
        lookup = await session.execute(
            text("SELECT id FROM bg_company WHERE short_name = :name LIMIT 1"),
            {"name": short_name},
        )
        found_id = lookup.scalar()
        if found_id:
            return found_id

        new_id = next(_id_gen)
        timestamp = datetime.now()
        new_company = insert(Company).values(
            id=new_id,
            name=short_name,
            short_name=short_name,
            status=0,
            create_time=timestamp,
            update_time=timestamp,
        )
        await session.execute(new_company)
        await session.commit()
        return new_id
|
||||
|
||||
|
||||
async def _update_pg_status(data_id: int, status: str) -> None:
    """Write the cleaning status of one ``app_job_data`` row.

    When ``status`` is ``"cleaned"`` the row's ``cleaned_at`` is also set to
    ``NOW()``; for any other status only ``clean_status`` changes.

    Args:
        data_id: Primary key of the ``app_job_data`` row.
        status: New ``clean_status`` value.
    """
    if status == "cleaned":
        sql = "UPDATE app_job_data SET clean_status = :s, cleaned_at = NOW() WHERE id = :id"
    else:
        sql = "UPDATE app_job_data SET clean_status = :s WHERE id = :id"

    async with PgSession() as session:
        await session.execute(text(sql), {"s": status, "id": data_id})
        await session.commit()
|
||||
|
||||
|
||||
def _build_user_message(data: dict) -> str:
    """Build the user message for the first AI call (structured extraction).

    The message lists the raw job fields (missing or empty values render as an
    empty string), followed by the cached job-category list and the cached
    industry list, separated by blank lines.

    Args:
        data: The raw ``app_job_data`` row as a dict.

    Returns:
        The newline-joined prompt text.
    """
    # (label shown to the model, key in the raw row)
    raw_fields = (
        ("岗位名称", "job_title"),
        ("薪资", "salary"),
        ("工作地点", "location"),
        ("公司", "company"),
        ("经验要求", "experience"),
        ("学历要求", "education"),
        ("岗位详情", "description"),
    )

    lines = ["【原始数据】"]
    lines.extend(f"{label}: {data.get(key) or ''}" for label, key in raw_fields)
    lines.append("")
    lines.append(f"【岗位分类列表】\n{dict_cache.job_category_text}")
    lines.append("")
    lines.append(f"【行业列表】\n{dict_cache.industry_text}")
    return "\n".join(lines)
|
||||
@@ -0,0 +1,29 @@
|
||||
"""岗位下架服务
|
||||
|
||||
每天定时执行,将 create_time 超过 N 天的岗位标记为已失效。
|
||||
"""
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.config import settings
|
||||
from app.core.database import MysqlSession
|
||||
from app.core.logger import log
|
||||
|
||||
|
||||
async def run_job_expire() -> None:
    """Mark active jobs older than ``settings.job_expire_days`` as expired.

    Sets ``status = 2`` on every ``bg_job`` row with ``status = 0`` whose
    ``create_time`` is more than the configured number of days in the past,
    and logs how many rows were changed.

    A non-positive day count is rejected: ``INTERVAL 0 DAY`` (or a negative
    interval) would match every active job and expire all of them at once.
    """
    days = int(settings.job_expire_days)
    if days <= 0:
        log.warning("岗位下架:job_expire_days={} 无效,已跳过", days)
        return

    async with MysqlSession() as mysql:
        # The day count is passed as a bound parameter, not formatted into SQL.
        result = await mysql.execute(
            text("""
                UPDATE bg_job
                SET status = 2, update_time = NOW()
                WHERE status = 0
                AND create_time < DATE_SUB(NOW(), INTERVAL :days DAY)
            """),
            {"days": days},
        )
        await mysql.commit()
        affected = result.rowcount

    if affected > 0:
        log.info("岗位下架:{}条岗位已标记为失效(超过{}天)", affected, days)
|
||||
@@ -0,0 +1,42 @@
|
||||
"""僵尸恢复服务"""
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from app.core.database import PgSession, MysqlSession
|
||||
from app.core.logger import log
|
||||
|
||||
|
||||
async def recover_job_zombie() -> None:
    """Reset stuck job-cleaning rows from ``cleaning`` back to ``pending``.

    A row counts as stuck when its ``clean_started_at`` is more than 10 minutes
    old. ``clean_started_at`` is cleared on reset. The number of rows reset is
    logged when it is greater than zero.
    """
    reset_stmt = text("""
        UPDATE app_job_data
        SET clean_status = 'pending', clean_started_at = NULL
        WHERE clean_status = 'cleaning'
        AND clean_started_at < NOW() - INTERVAL '10 minutes'
    """)

    async with PgSession() as session:
        outcome = await session.execute(reset_stmt)
        await session.commit()
        reset_count = outcome.rowcount

    if reset_count <= 0:
        return
    log.info("岗位僵尸恢复:重置{}条数据", reset_count)
|
||||
|
||||
|
||||
async def recover_company_zombie() -> None:
    """Reset stuck company-enrichment rows from status 3 back to status 0.

    A ``bg_company`` row counts as stuck when its ``update_time`` is more than
    10 minutes old while ``status`` is 3. ``update_time`` is refreshed on
    reset. The number of rows reset is logged when it is greater than zero.
    """
    reset_stmt = text("""
        UPDATE bg_company
        SET status = 0, update_time = NOW()
        WHERE status = 3
        AND update_time < NOW() - INTERVAL 10 MINUTE
    """)

    async with MysqlSession() as session:
        outcome = await session.execute(reset_stmt)
        await session.commit()
        reset_count = outcome.rowcount

    if reset_count <= 0:
        return
    log.info("公司僵尸恢复:重置{}条数据", reset_count)
|
||||
Reference in New Issue
Block a user