"""Crawler for the China Minmetals recruitment site hosted on wecruit.hotjob.cn.

The module offers two things:

* ``HotjobCrawler`` -- fetches paginated position lists and position details
  from the site's AJAX endpoints.
* ``parse_to_db`` -- converts raw position records into the flat dictionaries
  expected by the ``app_job_data`` table.
"""

import hashlib
import math  # not used by the code below; kept because other code may rely on it
import time
from datetime import datetime

import requests

# Tenant ("suite") key that identifies the company on wecruit.hotjob.cn.
_SUITE_KEY = "SU62f3786ebef57c29ead8adba"

# Maps the API's recruitType code to the recruit_category value stored in the DB.
CATEGORY_DB_MAP = {
    "1": 1,   # recruitType=1  (campus hiring)      -> DB category 1
    "2": 0,   # recruitType=2  (experienced hiring) -> DB category 0
    "12": 2,  # recruitType=12 (internship)         -> DB category 2
}


def _build_description(record):
    """Join the duties / requirements / major sections of *record* into one text."""
    work_content = record.get("workContent") or ""
    service_condition = record.get("serviceCondition") or ""
    subject = record.get("subject") or ""

    parts = []
    if work_content:
        parts.append(f"【工作职责】\n{work_content}")
    if service_condition:
        parts.append(f"【任职要求】\n{service_condition}")
    # The major requirement is only emitted when there is no requirements text.
    if subject and not service_condition:
        parts.append(f"【专业要求】\n{subject}")
    return "\n\n".join(parts)


def parse_to_db(records, task_crawl_id, company_id="minmetals", company="中国五矿"):
    """Clean raw API position records into rows for the ``app_job_data`` table.

    Args:
        records: Iterable of dicts as returned by the position list/detail API.
        task_crawl_id: Identifier of the crawl task, copied into every row.
        company_id: Company identifier stored in each row and mixed into the
            content hash.
        company: Fallback company name used when a record has no ``company``.

    Returns:
        A list of dicts, one per record that has a non-blank ``postName``.
        Every row carries ``task_crawl_id``, ``job_title``, ``company_id``,
        ``company``, ``detail_url``, ``recruit_category``, ``content_hash`` and
        ``expire_at``; ``location``, ``education``, ``experience`` and
        ``description`` are present only when the record supplies a
        meaningful value.
    """
    results = []
    for r in records:
        job_title = (r.get("postName") or "").strip()
        if not job_title:
            # A position without a title cannot be stored; skip it.
            continue

        description = _build_description(r)

        # Treat an explicit null like a missing key (default "1"), the same
        # way the other fields of this function treat None as "absent".
        raw_type = r.get("recruitType")
        recruit_type = "1" if raw_type is None else str(raw_type)

        post_id = r.get("postId", "")
        # The detail page only distinguishes "campus" from everything else.
        channel = "campus" if recruit_type == "1" else "social"
        detail_url = (
            f"https://wecruit.hotjob.cn/{_SUITE_KEY}/mc/detail"
            f"?postId={post_id}&recruitType={channel}"
        )

        # MD5 is used purely as a change-detection fingerprint, not for security.
        content_hash = hashlib.md5(
            f"{job_title}|{company_id}|{description}".encode("utf-8")
        ).hexdigest()

        item = {
            "task_crawl_id": task_crawl_id,
            "job_title": job_title,
            "company_id": company_id,
            "company": r.get("company") or company,
            "detail_url": detail_url,
            "recruit_category": CATEGORY_DB_MAP.get(recruit_type, 0),
            "content_hash": content_hash,
        }

        # "全部地区" and "无经验" are placeholder values and are not stored.
        if r.get("workPlaceStr") and r["workPlaceStr"] != "全部地区":
            item["location"] = r["workPlaceStr"]
        if r.get("educationStr"):
            item["education"] = r["educationStr"]
        if r.get("workYears") and r["workYears"] != "无经验":
            item["experience"] = r["workYears"]
        if description:
            item["description"] = description

        publish_date = r.get("publishDate") or ""
        if publish_date:
            # Keep only the leading "YYYY-MM-DD HH:MM:SS" part (19 characters).
            item["expire_at"] = publish_date[:19]
        else:
            item["expire_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        results.append(item)

    return results


class HotjobCrawler:
    """Crawler for the China Minmetals recruitment site (wecruit.hotjob.cn)."""

    BASE_URL = "https://wecruit.hotjob.cn/wecruit/positionInfo"
    SUITE_KEY = _SUITE_KEY
    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 15
    # Maps the crawler's category names to the API's recruitType codes.
    CATEGORY_MAP = {
        "xiaozhao": "1",   # campus hiring
        "shezhao": "2",    # experienced hiring
        "shixi": "12",     # internship
    }

    def __init__(self):
        """Create an HTTP session carrying the headers sent with every request."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
            "Referer": f"https://wecruit.hotjob.cn/{self.SUITE_KEY}/mc/position/campus",
            "Content-Type": "application/x-www-form-urlencoded",
        })

    @staticmethod
    def _list_form(recruit_type, page, page_size):
        """Return the form fields for one page of the position list."""
        return {
            "recruitType": recruit_type,
            "currentPage": page,
            "pageSize": page_size,
            "coordinateLat": "",
            "coordinateLng": "",
            "orgCode": 0,
        }

    @staticmethod
    def _unwrap(result):
        """Return ``result["data"]``, raising if the API reports a failure state.

        Raises:
            RuntimeError: If ``result["state"]`` is not the string ``"200"``.
        """
        if result.get("state") != "200":
            raise RuntimeError(f"API错误: {result}")
        return result["data"]

    def _post(self, endpoint, form):
        """POST *form* to *endpoint* and return the ``data`` part of the reply.

        Raises:
            requests.HTTPError: On a non-2xx HTTP status.
            RuntimeError: If the API reports a non-"200" state.
        """
        url = f"{self.BASE_URL}/{endpoint}/{self.SUITE_KEY}?iSaJAx=isAjax&request_locale=zh_CN"
        # Passing a dict lets requests form-encode (and escape) the values.
        resp = self.session.post(url, data=form, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return self._unwrap(resp.json())

    def _get_list_page(self, recruit_type, page=1, page_size=10):
        """Fetch one page of the position list for *recruit_type*."""
        return self._post(
            "listPosition", self._list_form(recruit_type, page, page_size)
        )

    def _get_all_jobs(self, recruit_type):
        """Fetch every page of the position list for *recruit_type*.

        Returns:
            A dict with ``total`` (the count reported by the API), ``records``
            (all list entries) and ``position_ids`` (their ``postId`` values).
        """
        first = self._get_list_page(recruit_type, page=1)
        # "positonNum" (sic) is the key the code reads from the API response.
        total = first["positonNum"]
        if total == 0:
            return {"total": 0, "records": [], "position_ids": []}

        page_form = first["pageForm"]
        # Copy so that extending the result never mutates the response object.
        all_records = list(page_form["pageData"])
        for page in range(2, page_form["totalPage"] + 1):
            time.sleep(0.3)  # short pause between consecutive page requests
            page_data = self._get_list_page(recruit_type, page=page)
            all_records.extend(page_data["pageForm"]["pageData"])

        position_ids = [r["postId"] for r in all_records]
        return {"total": total, "records": all_records, "position_ids": position_ids}

    def get_xiaozhao_list(self):
        """Return all campus-hiring positions."""
        return self._get_all_jobs(self.CATEGORY_MAP["xiaozhao"])

    def get_shezhao_list(self):
        """Return all experienced-hiring positions."""
        return self._get_all_jobs(self.CATEGORY_MAP["shezhao"])

    def get_shixi_list(self):
        """Return all internship positions."""
        return self._get_all_jobs(self.CATEGORY_MAP["shixi"])

    def get_position_detail(self, post_id, recruit_type="1"):
        """Return the detail record of position *post_id*."""
        return self._post(
            "listPositionDetail",
            {"postId": post_id, "recruitType": recruit_type},
        )


def main():
    """Crawl all three categories and print a short smoke-test report."""
    import sys

    # Force UTF-8 output so the Chinese text prints on any console; stdout may
    # have been replaced by an object that has no reconfigure().
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    crawler = HotjobCrawler()

    print("=" * 60)
    print("中国五矿 - 招聘岗位爬取")
    print("=" * 60)

    print("\n[校园招聘]")
    xiaozhao = crawler.get_xiaozhao_list()
    print(f"  共 {xiaozhao['total']} 个岗位")

    print("\n[社会招聘]")
    shezhao = crawler.get_shezhao_list()
    print(f"  共 {shezhao['total']} 个岗位")

    print("\n[实习招聘]")
    shixi = crawler.get_shixi_list()
    print(f"  共 {shixi['total']} 个岗位")

    # Fetch the details of the first five campus positions as a test.
    print("\n[获取详情测试 - 校招前5个]")
    details = []
    for r in xiaozhao["records"][:5]:
        time.sleep(0.3)
        detail = crawler.get_position_detail(r["postId"], "1")
        details.append(detail)
        print(f"  {detail['postName']} | {detail.get('workPlaceStr','')}")

    # Run the cleaning step on the fetched details.
    print("\n[数据清洗]")
    parsed = parse_to_db(details, 0)
    print(f"  清洗完成: {len(parsed)} 条")
    if parsed:
        print("\n--- 样例 ---")
        for k, v in parsed[0].items():
            print(f"  {k}: {str(v)[:100]}")


if __name__ == "__main__":
    main()