Files
post_crawler/crawl/hotjob/hotjob_crawler.py
T
kgod c06f595559 feat: add crawl scripts for recruitment websites
- btyy (倍特药业), fullsemi (富芯半导体): 北森平台爬虫
- hotjob (中国五矿): hotjob平台爬虫
- leinao (中科类脑): 静态HTML爬虫
- task_fetcher: 原子锁获取任务
- post.md: 抓取技能文档
- export_har: mitmproxy HAR导出工具
2026-05-27 23:48:30 +08:00

181 lines
6.3 KiB
Python

import requests
import hashlib
import time
import math
from datetime import datetime
# Maps the API's recruitType code (as a string) to the recruit_category value
# written to the database.
CATEGORY_DB_MAP = {
    "1": 1,   # recruitType=1 campus recruiting -> database value 1
    "2": 0,   # recruitType=2 social recruiting -> database value 0
    "12": 2,  # recruitType=12 internship -> database value 2
}


def parse_to_db(records, task_crawl_id, company_id="minmetals", company="中国五矿"):
    """Clean job records returned by the API into rows for the app_job_data table.

    Args:
        records: Iterable of dicts as returned by the API. ``None`` or an
            empty iterable yields an empty result.
        task_crawl_id: Identifier copied verbatim into every produced row.
        company_id: Company identifier stored on each row and mixed into the
            content hash.
        company: Company name used when a record carries no ``company`` value.

    Returns:
        A list of dicts. Every dict has ``task_crawl_id``, ``job_title``,
        ``company_id``, ``company``, ``detail_url``, ``recruit_category``,
        ``content_hash`` and ``expire_at``; ``location``, ``education``,
        ``experience`` and ``description`` are present only when the record
        supplies a usable value. Records without a non-blank ``postName``
        are skipped.
    """
    # One fallback timestamp per call, so every record of a batch that lacks
    # publishDate receives the same value.
    fallback_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    results = []
    for r in records or ():
        job_title = (r.get("postName") or "").strip()
        if not job_title:
            continue

        work_content = r.get("workContent") or ""
        service_condition = r.get("serviceCondition") or ""
        subject = r.get("subject") or ""

        # The description text feeds the content hash below; its exact layout
        # must stay stable or previously stored hashes stop matching.
        parts = []
        if work_content:
            parts.append(f"【工作职责】\n{work_content}")
        if service_condition:
            parts.append(f"【任职要求】\n{service_condition}")
        if subject and not service_condition:
            # The subject is only used when no requirements text exists.
            parts.append(f"【专业要求】\n{subject}")
        description = "\n\n".join(parts)

        # An explicit null is treated like a missing key (default "1") instead
        # of being stringified to "None".
        raw_type = r.get("recruitType")
        recruit_type = "1" if raw_type is None else str(raw_type).strip()

        post_id = r.get("postId", "")
        # Only recruitType "1" links to the campus page; every other type
        # (including internship "12") links to the social page.
        url_type = "campus" if recruit_type == "1" else "social"
        detail_url = f"https://wecruit.hotjob.cn/SU62f3786ebef57c29ead8adba/mc/detail?postId={post_id}&recruitType={url_type}"

        content_hash = hashlib.md5(
            f"{job_title}|{company_id}|{description}".encode("utf-8")
        ).hexdigest()

        item = {
            "task_crawl_id": task_crawl_id,
            "job_title": job_title,
            "company_id": company_id,
            "company": r.get("company") or company,
            "detail_url": detail_url,
            # Unknown recruit types fall back to category 0.
            "recruit_category": CATEGORY_DB_MAP.get(recruit_type, 0),
            "content_hash": content_hash,
        }

        # "全部地区" (all regions) and "无经验" (no experience) are placeholder
        # values and are not stored.
        work_place = r.get("workPlaceStr")
        if work_place and work_place != "全部地区":
            item["location"] = work_place
        if r.get("educationStr"):
            item["education"] = r["educationStr"]
        work_years = r.get("workYears")
        if work_years and work_years != "无经验":
            item["experience"] = work_years
        if description:
            item["description"] = description

        # NOTE(review): expire_at is filled from publishDate (first 19
        # characters), or the current local time when absent -- confirm this
        # is the intended meaning of the column.
        publish_date = r.get("publishDate") or ""
        item["expire_at"] = publish_date[:19] if publish_date else fallback_time

        results.append(item)
    return results
class HotjobCrawler:
    """Crawler for the China Minmetals recruitment site (wecruit.hotjob.cn).

    All requests go through one ``requests.Session`` that carries the
    browser-like headers set in ``__init__``. List endpoints are paged;
    ``_get_all_jobs`` walks every page and concatenates the records.
    """

    BASE_URL = "https://wecruit.hotjob.cn/wecruit/positionInfo"
    SUITE_KEY = "SU62f3786ebef57c29ead8adba"
    # Friendly category name -> recruitType code sent to the API.
    CATEGORY_MAP = {
        "xiaozhao": "1",
        "shezhao": "2",
        "shixi": "12",
    }
    # Seconds passed to requests as the per-request timeout. requests applies
    # no timeout unless one is given, so a stalled connection would otherwise
    # block the crawl forever. Override on an instance or subclass if needed.
    REQUEST_TIMEOUT = 15
    # Pause in seconds between consecutive list-page requests.
    PAGE_DELAY = 0.3

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
            "Referer": f"https://wecruit.hotjob.cn/{self.SUITE_KEY}/mc/position/campus",
            "Content-Type": "application/x-www-form-urlencoded",
        })

    def _post(self, endpoint, data):
        """POST form fields to an API endpoint and return the ``data`` payload.

        Args:
            endpoint: Path segment placed between BASE_URL and SUITE_KEY.
            data: Mapping of form fields; requests form-encodes it.

        Raises:
            requests.HTTPError: If the HTTP status signals an error.
            RuntimeError: If the JSON body's ``state`` is not ``"200"``.
        """
        url = f"{self.BASE_URL}/{endpoint}/{self.SUITE_KEY}?iSaJAx=isAjax&request_locale=zh_CN"
        resp = self.session.post(url, data=data, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        result = resp.json()
        if result.get("state") != "200":
            raise RuntimeError(f"API错误: {result}")
        return result["data"]

    def _get_list_page(self, recruit_type, page=1, page_size=10):
        """Fetch one page of the position list for a recruitType code."""
        return self._post("listPosition", {
            "recruitType": recruit_type,
            "currentPage": page,
            "pageSize": page_size,
            "coordinateLat": "",
            "coordinateLng": "",
            "orgCode": "0",
        })

    def _get_all_jobs(self, recruit_type):
        """Fetch every page of a category and merge the records.

        Returns:
            A dict with ``total`` (the count reported on the first page),
            ``records`` (all page records concatenated) and ``position_ids``
            (the ``postId`` of each record, in order).
        """
        first = self._get_list_page(recruit_type, page=1)
        # Key spelling kept exactly as the code has always read it;
        # presumably it matches the API's own field name.
        total = first["positonNum"]
        if total == 0:
            return {"total": 0, "records": [], "position_ids": []}

        page_form = first["pageForm"]
        # Copy so the response object is not mutated; tolerate a null page.
        all_records = list(page_form["pageData"] or [])
        total_pages = int(page_form.get("totalPage") or 1)
        for page in range(2, total_pages + 1):
            time.sleep(self.PAGE_DELAY)
            page_data = self._get_list_page(recruit_type, page=page)
            all_records.extend(page_data["pageForm"]["pageData"] or [])

        position_ids = [r["postId"] for r in all_records]
        return {"total": total, "records": all_records, "position_ids": position_ids}

    def get_xiaozhao_list(self):
        """Fetch the campus recruiting list."""
        return self._get_all_jobs(self.CATEGORY_MAP["xiaozhao"])

    def get_shezhao_list(self):
        """Fetch the social recruiting list."""
        return self._get_all_jobs(self.CATEGORY_MAP["shezhao"])

    def get_shixi_list(self):
        """Fetch the internship recruiting list."""
        return self._get_all_jobs(self.CATEGORY_MAP["shixi"])

    def get_position_detail(self, post_id, recruit_type="1"):
        """Fetch the detail payload of a single position."""
        return self._post("listPositionDetail", {
            "postId": post_id,
            "recruitType": recruit_type,
        })
def main():
    """Run a manual smoke test against the live site.

    Prints the job count of each category, fetches the detail of the first
    five campus positions, runs them through ``parse_to_db`` and prints the
    first cleaned row. Performs real network requests.
    """
    import sys

    # Force UTF-8 so the Chinese output prints on consoles with another
    # default encoding; skipped when stdout has been replaced by a stream
    # that has no reconfigure().
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding='utf-8')

    crawler = HotjobCrawler()
    print("=" * 60)
    print("中国五矿 - 招聘岗位爬取")
    print("=" * 60)

    print("\n[校园招聘]")
    xiaozhao = crawler.get_xiaozhao_list()
    print(f"{xiaozhao['total']} 个岗位")

    print("\n[社会招聘]")
    shezhao = crawler.get_shezhao_list()
    print(f"{shezhao['total']} 个岗位")

    print("\n[实习招聘]")
    shixi = crawler.get_shixi_list()
    print(f"{shixi['total']} 个岗位")

    # Detail test: fetch the first five campus positions.
    print("\n[获取详情测试 - 校招前5个]")
    details = []
    for r in xiaozhao["records"][:5]:
        time.sleep(0.3)
        detail = crawler.get_position_detail(r["postId"], "1")
        details.append(detail)
        # .get() keeps the smoke test running if a detail lacks postName.
        print(f" {detail.get('postName', '')} | {detail.get('workPlaceStr','')}")

    # Cleaning test.
    print("\n[数据清洗]")
    parsed = parse_to_db(details, 0)
    print(f" 清洗完成: {len(parsed)}")
    if parsed:
        print("\n--- 样例 ---")
        for k, v in parsed[0].items():
            print(f" {k}: {str(v)[:100]}")


if __name__ == "__main__":
    main()