补充文档，修改清洗方案

This commit is contained in:
zk
2026-06-02 18:38:36 +08:00
parent 30e6a6e2a5
commit 6967e4ba54
7 changed files with 363 additions and 80 deletions
+19 -19
View File
@@ -14,23 +14,23 @@ class Company(MysqlBase):
__tablename__ = "bg_company"
id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
name: Mapped[Optional[str]] = mapped_column(String(255))
short_name: Mapped[str] = mapped_column(String(128), nullable=False)
logo_url: Mapped[Optional[str]] = mapped_column(String(512))
region_code: Mapped[Optional[str]] = mapped_column(String(20))
company_type: Mapped[Optional[str]] = mapped_column(String(32))
industry_id: Mapped[Optional[int]] = mapped_column(BigInteger)
tags: Mapped[Optional[list]] = mapped_column(JSON)
summary: Mapped[Optional[str]] = mapped_column(String(512))
description: Mapped[Optional[str]] = mapped_column(Text)
founded_year: Mapped[Optional[str]] = mapped_column(String(10))
address: Mapped[Optional[str]] = mapped_column(String(255))
scale: Mapped[Optional[str]] = mapped_column(String(32))
website: Mapped[Optional[str]] = mapped_column(String(255))
financing_stage: Mapped[Optional[str]] = mapped_column(String(32))
latest_valuation: Mapped[Optional[str]] = mapped_column(String(64))
news: Mapped[Optional[list]] = mapped_column(JSON)
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, comment="主键ID(雪花)")
name: Mapped[Optional[str]] = mapped_column(String(255), comment="公司全称")
short_name: Mapped[str] = mapped_column(String(128), nullable=False, comment="公司简称")
logo_url: Mapped[Optional[str]] = mapped_column(String(512), comment="Logo地址")
region_code: Mapped[Optional[str]] = mapped_column(String(20), comment="地区编码")
company_type: Mapped[Optional[str]] = mapped_column(String(32), comment="企业类型:上市企业/独角兽/国企等")
industry_id: Mapped[Optional[int]] = mapped_column(BigInteger, comment="行业ID")
tags: Mapped[Optional[list]] = mapped_column(JSON, comment="公司标签JSON数组")
summary: Mapped[Optional[str]] = mapped_column(String(512), comment="一句话简介")
description: Mapped[Optional[str]] = mapped_column(Text, comment="公司详细描述")
founded_year: Mapped[Optional[str]] = mapped_column(String(10), comment="成立年份")
address: Mapped[Optional[str]] = mapped_column(String(255), comment="总部地址")
scale: Mapped[Optional[str]] = mapped_column(String(32), comment="企业规模")
website: Mapped[Optional[str]] = mapped_column(String(255), comment="官网地址")
financing_stage: Mapped[Optional[str]] = mapped_column(String(32), comment="融资状态")
latest_valuation: Mapped[Optional[str]] = mapped_column(String(64), comment="最新估值")
news: Mapped[Optional[list]] = mapped_column(JSON, comment="相关新闻JSON数组")
status: Mapped[int] = mapped_column(Integer, default=0, comment="0=待完善 1=已完善 2=禁用 3=补充中 4=补充失败")
create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
update_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="创建时间")
update_time: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="更新时间")
+22 -21
View File
@@ -14,26 +14,27 @@ class Job(MysqlBase):
__tablename__ = "bg_job"
id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
title: Mapped[str] = mapped_column(String(255), nullable=False)
company_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
category_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
employment_type: Mapped[int] = mapped_column(Integer, default=0)
description: Mapped[Optional[str]] = mapped_column(Text)
requirement: Mapped[Optional[str]] = mapped_column(Text)
bonus: Mapped[Optional[str]] = mapped_column(Text)
tags: Mapped[Optional[list]] = mapped_column(JSON)
skill_tags: Mapped[Optional[list]] = mapped_column(JSON)
salary: Mapped[Optional[str]] = mapped_column(String(64))
education: Mapped[int] = mapped_column(Integer, default=0)
min_experience: Mapped[int] = mapped_column(Integer, default=0)
required_industry_id: Mapped[Optional[int]] = mapped_column(BigInteger)
required_major_ids: Mapped[Optional[list]] = mapped_column(JSON)
major_sensitivity: Mapped[Optional[int]] = mapped_column(Integer)
source_url: Mapped[Optional[str]] = mapped_column(String(1024))
source_id: Mapped[Optional[str]] = mapped_column(String(64))
recruit_category: Mapped[Optional[int]] = mapped_column(Integer, comment="招聘分类: 0=校招, 1=实习, 2=社招, 3=其他")
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, comment="主键ID(雪花)")
title: Mapped[str] = mapped_column(String(255), nullable=False, comment="岗位名称")
company_id: Mapped[int] = mapped_column(BigInteger, nullable=False, comment="关联公司ID")
category_id: Mapped[int] = mapped_column(BigInteger, nullable=False, comment="岗位类型ID")
employment_type: Mapped[int] = mapped_column(Integer, default=0, comment="0=全职 1=兼职")
description: Mapped[Optional[str]] = mapped_column(Text, comment="岗位职责")
requirement: Mapped[Optional[str]] = mapped_column(Text, comment="任职要求")
bonus: Mapped[Optional[str]] = mapped_column(Text, comment="加分项")
tags: Mapped[Optional[list]] = mapped_column(JSON, comment="岗位标签JSON数组")
skill_tags: Mapped[Optional[list]] = mapped_column(JSON, comment="技能标签JSON数组")
salary: Mapped[Optional[str]] = mapped_column(String(64), comment="薪资描述,如15-25K")
education: Mapped[int] = mapped_column(Integer, default=0, comment="学历要求 0=不限 1=大专 2=本科 3=硕士 4=博士")
min_experience: Mapped[int] = mapped_column(Integer, default=0, comment="最低工作年限,0=不要求")
required_industry_id: Mapped[Optional[int]] = mapped_column(BigInteger, comment="要求的行业经验ID")
required_major_ids: Mapped[Optional[list]] = mapped_column(JSON, comment="要求专业ID数组")
major_sensitivity: Mapped[Optional[int]] = mapped_column(Integer, comment="专业敏感度 0=不限 1=优先 2=强制")
source_url: Mapped[Optional[str]] = mapped_column(String(1024), comment="来源链接")
source_id: Mapped[Optional[str]] = mapped_column(String(64), comment="爬虫原始数据ID,用于去重")
content_hash: Mapped[Optional[str]] = mapped_column(String(64), comment="内容哈希,用于去重续命")
recruit_category: Mapped[Optional[int]] = mapped_column(Integer, comment="招聘分类: 0=校招 1=实习 2=社招 3=其他")
expire_at: Mapped[Optional[datetime]] = mapped_column(DateTime, comment="发布日期")
status: Mapped[int] = mapped_column(Integer, default=0, comment="0=上架 1=下架 2=已失效")
create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
update_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="创建时间")
update_time: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="更新时间")
+8 -8
View File
@@ -13,10 +13,10 @@ class JobRegionRelation(MysqlBase):
__tablename__ = "bg_job_region_relation"
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
job_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
region_code: Mapped[str] = mapped_column(String(20), nullable=False)
create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, comment="主键ID(雪花)")
job_id: Mapped[int] = mapped_column(BigInteger, nullable=False, comment="岗位ID")
region_code: Mapped[str] = mapped_column(String(20), nullable=False, comment="地区编码")
create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="创建时间")
class JobSkillTagRelation(MysqlBase):
@@ -24,7 +24,7 @@ class JobSkillTagRelation(MysqlBase):
__tablename__ = "bg_job_skill_tag_relation"
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
job_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
skill_tag_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False)
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, comment="主键ID(雪花)")
job_id: Mapped[int] = mapped_column(BigInteger, nullable=False, comment="岗位ID")
skill_tag_id: Mapped[int] = mapped_column(BigInteger, nullable=False, comment="技能标签ID")
create_time: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="创建时间")
+2 -2
View File
@@ -11,5 +11,5 @@ class SkillTag(MysqlBase):
__tablename__ = "bg_skill_tag"
id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
name: Mapped[str] = mapped_column(String(100), unique=True, nullable=False)
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, comment="主键ID(雪花)")
name: Mapped[str] = mapped_column(String(100), unique=True, nullable=False, comment="标签名称(唯一索引)")
+18 -18
View File
@@ -3,7 +3,7 @@
from datetime import datetime
from typing import Optional
from sqlalchemy import BigInteger, DateTime, Integer, SmallInteger, String, Text
from sqlalchemy import BigInteger, DateTime, SmallInteger, String, Text
from sqlalchemy.orm import Mapped, mapped_column
from app.core.database import PgBase
@@ -14,22 +14,22 @@ class AppJobData(PgBase):
__tablename__ = "app_job_data"
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True)
id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True, comment="自增主键")
urllistid: Mapped[int] = mapped_column(BigInteger, nullable=False, comment="关联urllistid")
job_title: Mapped[Optional[str]] = mapped_column(String(255))
salary: Mapped[Optional[str]] = mapped_column(String(128))
location: Mapped[Optional[str]] = mapped_column(String(2048))
company: Mapped[Optional[str]] = mapped_column(String(255), comment="公司名")
experience: Mapped[Optional[str]] = mapped_column(String(64))
education: Mapped[Optional[str]] = mapped_column(String(64))
description: Mapped[str] = mapped_column(Text, nullable=False)
detail_url: Mapped[str] = mapped_column(String(1024), nullable=False)
recruit_category: Mapped[int] = mapped_column(SmallInteger, default=3, nullable=False, comment="招聘分类: 0=校招, 1=实习, 2=社招, 3=其他")
content_hash: Mapped[str] = mapped_column(String(64), nullable=False)
sources: Mapped[int] = mapped_column(SmallInteger, default=0, nullable=False)
job_title: Mapped[Optional[str]] = mapped_column(String(255), comment="职位名称")
salary: Mapped[Optional[str]] = mapped_column(String(128), comment="薪资")
location: Mapped[Optional[str]] = mapped_column(String(2048), comment="工作地点")
company: Mapped[Optional[str]] = mapped_column(String(255), comment="公司名")
experience: Mapped[Optional[str]] = mapped_column(String(64), comment="经验要求")
education: Mapped[Optional[str]] = mapped_column(String(64), comment="学历要求")
description: Mapped[str] = mapped_column(Text, nullable=False, comment="岗位详情(职责+要求+介绍)")
detail_url: Mapped[str] = mapped_column(String(1024), nullable=False, comment="详情页URL")
recruit_category: Mapped[int] = mapped_column(SmallInteger, default=3, nullable=False, comment="招聘分类: 0=校招 1=实习 2=社招 3=其他")
content_hash: Mapped[str] = mapped_column(String(64), nullable=False, comment="内容哈希,用于去重")
sources: Mapped[int] = mapped_column(SmallInteger, default=0, nullable=False, comment="数据来源 0=官网 1=平台")
expire_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="发布日期")
created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False)
clean_status: Mapped[str] = mapped_column(String(20), default="pending", nullable=False, comment="pending/cleaning/cleaned/discarded")
clean_started_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
cleaned_at: Mapped[Optional[datetime]] = mapped_column(DateTime)
created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="创建时间")
updated_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, comment="更新时间")
clean_status: Mapped[str] = mapped_column(String(20), default="pending", nullable=False, comment="清洗状态: pending/cleaning/cleaned/discarded")
clean_started_at: Mapped[Optional[datetime]] = mapped_column(DateTime, comment="清洗开始时间")
cleaned_at: Mapped[Optional[datetime]] = mapped_column(DateTime, comment="清洗完成时间")