"""Crawler for the Beite Pharmaceutical recruiting site (btyy.zhiye.com).

The module fetches the paginated job-ad list for each recruiting category and
converts the raw API records into flat dicts shaped for the ``app_job_data``
table.
"""

import hashlib
import json
import math
import os
import time
from datetime import datetime

import requests

# Maps the site's category id to the ``recruit_category`` value stored in the DB.
CATEGORY_DB_MAP = {
    "1": 0,  # site "social recruitment" -> DB social hire
    "2": 1,  # site "campus recruitment" -> DB campus hire
}


def parse_to_db(records, task_crawl_id, company_id="btyy", company="倍特药业"):
    """Clean raw API job records into rows for the ``app_job_data`` table.

    Records without a non-blank ``JobAdName`` are skipped. Optional columns
    (location, salary, education, experience, description) are only present
    in a row when the source record carries a truthy value for them.

    :param records: iterable of job dicts as returned by the list API
    :param task_crawl_id: id of the crawl task, copied into every row
    :param company_id: company identifier, also part of the content hash
    :param company: company display name, copied into every row
    :return: list[dict], one dict per retained record
    """
    results = []
    for r in records:
        job_title = (r.get("JobAdName") or "").strip()
        if not job_title:
            continue

        # A lone "/" is treated as a placeholder for "no content".
        duty = r.get("Duty") or ""
        require = r.get("Require") or ""
        parts = []
        if duty and duty != "/":
            parts.append(f"【工作职责】\n{duty}")
        if require and require != "/":
            parts.append(f"【任职要求】\n{require}")
        description = "\n\n".join(parts)

        # Normalise to str so an integer id from the API still matches the
        # string keys of CATEGORY_DB_MAP and the "1" comparison below.
        category_id = str(r.get("CategoryId", "1"))
        job_id = r.get("Id", "")
        prefix = "social" if category_id == "1" else "campus"
        detail_url = f"https://btyy.zhiye.com/{prefix}/jobs/{job_id}"

        # MD5 is used only as a change-detection fingerprint, not for security.
        content_hash = hashlib.md5(
            f"{job_title}|{company_id}|{description}".encode("utf-8")
        ).hexdigest()

        item = {
            "task_crawl_id": task_crawl_id,
            "job_title": job_title,
            "company_id": company_id,
            "company": company,
            "detail_url": detail_url,
            # Unknown categories fall back to 0 (social hire).
            "recruit_category": CATEGORY_DB_MAP.get(category_id, 0),
            "content_hash": content_hash,
        }

        # Optional fields: only set when the record has a value.
        loc_names = r.get("LocNames")
        if loc_names:
            if isinstance(loc_names, str):
                # Joining a plain string would split it into characters.
                item["location"] = loc_names
            else:
                item["location"] = ",".join(str(name) for name in loc_names)
        if r.get("Salary"):
            item["salary"] = r["Salary"]
        if r.get("Degree"):
            item["education"] = r["Degree"]
        if r.get("YearsOfWorking"):
            item["experience"] = r["YearsOfWorking"]
        if description:
            item["description"] = description

        # NOTE(review): expire_at is filled from the *post* date (date part
        # only, midnight) or, failing that, the current local time. Confirm
        # this matches what the DB column is meant to hold.
        post_date = r.get("PostDate") or ""
        if post_date:
            item["expire_at"] = post_date[:10] + " 00:00:00"
        else:
            item["expire_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        results.append(item)

    return results


class BtyyApiError(Exception):
    """Raised when the job API answers with a business ``Code`` other than 200."""


class BtyyJobCrawler:
    """Crawler for the Beite Pharmaceutical recruiting site (btyy.zhiye.com)."""

    BASE_URL = "https://btyy.zhiye.com/api/Jobad"

    CATEGORY_MAP = {
        "shezhao": "1",
        "xiaozhao": "2",
    }

    # Page size used when walking a whole category.
    PAGE_SIZE = 20
    # Page size used when searching for a single position.
    DETAIL_PAGE_SIZE = 100
    # Pause between consecutive page requests, in seconds.
    REQUEST_INTERVAL = 0.3

    def __init__(self, timeout=15):
        """Create the HTTP session with browser-like default headers.

        :param timeout: seconds to wait for each HTTP request before
            ``requests`` raises a timeout error; ``None`` waits forever
        """
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({
            "Content-Type": "application/json;charset=UTF-8",
            "Accept": "application/json, text/plain, */*",
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/146.0.0.0 Safari/537.36"
            ),
            "Referer": "https://btyy.zhiye.com/",
        })

    def _get_job_list_page(self, category_id, page_index=0, page_size=PAGE_SIZE):
        """Fetch one page of the job list for a category.

        :param category_id: site category id (a value of ``CATEGORY_MAP``)
        :param page_index: zero-based page index sent to the API
        :param page_size: number of records requested per page
        :return: the decoded JSON response body
        :raises requests.HTTPError: on a non-2xx HTTP status
        :raises BtyyApiError: when the body's ``Code`` field is not 200
        """
        url = f"{self.BASE_URL}/GetJobAdPageList"
        payload = {
            "PageIndex": page_index,
            "PageSize": page_size,
            "Category": [category_id],
            "KeyWords": "",
            "SpecialType": 0,
            "PortalId": "",
            "DisplayFields": [
                "Category", "LocId", "HeadCount", "PostDate",
                "ClassificationTwo", "WorkWeChatQrCode",
                "Degree", "Kind", "Org",
            ],
        }
        # Without a timeout a stalled connection would block the crawl forever.
        resp = self.session.post(url, json=payload, timeout=self.timeout)
        resp.raise_for_status()
        data = resp.json()
        if data.get("Code") != 200:
            raise BtyyApiError(f"API错误: {data.get('Message', '未知错误')}")
        return data

    def _get_all_jobs(self, category_id):
        """Fetch every job of a category, following pagination.

        :param category_id: site category id (a value of ``CATEGORY_MAP``)
        :return: ``{"total": int, "records": list, "position_ids": list}``
        """
        first_page = self._get_job_list_page(
            category_id, page_index=0, page_size=self.PAGE_SIZE
        )
        total = first_page.get("Count") or 0
        if total == 0:
            return {"total": 0, "records": [], "position_ids": []}

        # Copy so the accumulated list does not alias the response payload.
        all_records = list(first_page.get("Data") or [])

        # The page count must use the same page size that was requested.
        total_pages = math.ceil(total / self.PAGE_SIZE)
        for page in range(1, total_pages):
            time.sleep(self.REQUEST_INTERVAL)
            page_data = self._get_job_list_page(
                category_id, page_index=page, page_size=self.PAGE_SIZE
            )
            all_records.extend(page_data.get("Data") or [])

        position_ids = [r["Id"] for r in all_records]
        return {"total": total, "records": all_records, "position_ids": position_ids}

    def get_shezhao_list(self):
        """Return all social-recruitment jobs.

        :return: ``{"total": int, "records": list, "position_ids": list}``
        """
        return self._get_all_jobs(self.CATEGORY_MAP["shezhao"])

    def get_xiaozhao_list(self):
        """Return all campus-recruitment jobs.

        :return: ``{"total": int, "records": list, "position_ids": list}``
        """
        return self._get_all_jobs(self.CATEGORY_MAP["xiaozhao"])

    def get_shixi_list(self):
        """Return the internship list; this crawler has no internship category.

        :return: always an empty ``{"total", "records", "position_ids"}`` result
        """
        return {"total": 0, "records": [], "position_ids": []}

    def get_position_detail(self, position_id):
        """Look up one position by id.

        There is no separate detail request in this crawler: the list records
        already carry the ``Duty`` and ``Require`` fields, so the list
        endpoint is queried again for each category and scanned for a record
        whose ``Id`` matches. Pages are followed until the id is found or the
        category is exhausted.

        :param position_id: value to match against each record's ``Id``
        :return: the matching record dict, or ``None`` when nothing matches
        """
        for category_id in self.CATEGORY_MAP.values():
            page_index = 0
            while True:
                data = self._get_job_list_page(
                    category_id,
                    page_index=page_index,
                    page_size=self.DETAIL_PAGE_SIZE,
                )
                records = data.get("Data") or []
                for record in records:
                    if record.get("Id") == position_id:
                        return record

                page_index += 1
                total = data.get("Count") or 0
                # Stop on an empty page or once every reported record was seen.
                if not records or page_index * self.DETAIL_PAGE_SIZE >= total:
                    break
                time.sleep(self.REQUEST_INTERVAL)
        return None


def main():
    """Crawl both categories, clean the records and dump them to a JSON file."""
    crawler = BtyyJobCrawler()

    print("=" * 60)
    print("倍特药业 - 招聘岗位爬取")
    print("=" * 60)

    # Social recruitment
    print("\n[社会招聘]")
    shezhao = crawler.get_shezhao_list()
    print(f" 共 {shezhao['total']} 个岗位")

    # Campus recruitment
    print("\n[校园招聘]")
    xiaozhao = crawler.get_xiaozhao_list()
    print(f" 共 {xiaozhao['total']} 个岗位")

    # Data cleaning
    print("\n[数据清洗]")
    task_crawl_id = 0
    all_parsed = parse_to_db(shezhao["records"], task_crawl_id)
    all_parsed += parse_to_db(xiaozhao["records"], task_crawl_id)
    print(f" 清洗完成: {len(all_parsed)} 条")

    # Print one sample row; skipped when nothing was parsed.
    print("\n--- 样例 ---")
    if all_parsed:
        for k, v in all_parsed[0].items():
            print(f" {k}: {str(v)[:100]}")

    # Save; make sure the target directory exists before opening the file.
    output_file = "crawl/btyy/btyy_parsed.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_parsed, f, ensure_ascii=False, indent=2)
    print(f"\n已保存到 {output_file}")


if __name__ == "__main__":
    main()