feat: add crawl scripts for recruitment websites

- btyy (倍特药业), fullsemi (富芯半导体): 北森平台爬虫
- hotjob (中国五矿): hotjob平台爬虫
- leinao (中科类脑): 静态HTML爬虫
- task_fetcher: 原子锁获取任务
- post.md: 抓取技能文档
- export_har: mitmproxy HAR导出工具
2026-05-27 23:48:30 +08:00
parent 2e9efce291
commit c06f595559
7 changed files with 1324 additions and 0 deletions
@@ -0,0 +1,207 @@
+import requests
+import json
+import time
+import math
+import hashlib
+from datetime import datetime
+
+
+# Maps the site's category id (string key) to the recruit_category value
+# written to the database by parse_to_db.
+CATEGORY_DB_MAP = {
+    "1": 0,  # site "social recruitment" -> database social hire
+    "2": 1,  # site "campus recruitment" -> database campus hire
+}
+
+
+def parse_to_db(records, task_crawl_id, company_id="btyy", company="倍特药业"):
+    """
+    将API返回的岗位数据清洗为 app_job_data 表所需格式
+
+    :param records: API返回的岗位列表
+    :param task_crawl_id: 爬虫任务ID
+    :param company_id: 公司标识
+    :return: list[dict]
+    """
+    results = []
+    for r in records:
+        job_title = (r.get("JobAdName") or "").strip()
+        if not job_title:
+            continue
+
+        duty = r.get("Duty") or ""
+        require = r.get("Require") or ""
+        parts = []
+        if duty and duty != "/":
+            parts.append(f"【工作职责】\n{duty}")
+        if require and require != "/":
+            parts.append(f"【任职要求】\n{require}")
+        description = "\n\n".join(parts)
+
+        category_id = r.get("CategoryId", "1")
+        job_id = r.get("Id", "")
+        prefix = "social" if category_id == "1" else "campus"
+        detail_url = f"https://btyy.zhiye.com/{prefix}/jobs/{job_id}"
+
+        content_hash = hashlib.md5(
+            f"{job_title}|{company_id}|{description}".encode("utf-8")
+        ).hexdigest()
+
+        item = {
+            "task_crawl_id": task_crawl_id,
+            "job_title": job_title,
+            "company_id": company_id,
+            "company": company,
+            "detail_url": detail_url,
+            "recruit_category": CATEGORY_DB_MAP.get(category_id, 0),
+            "content_hash": content_hash,
+        }
+
+        # 可选字段，有值才设置
+        loc_names = r.get("LocNames")
+        if loc_names:
+            item["location"] = ",".join(loc_names)
+
+        if r.get("Salary"):
+            item["salary"] = r["Salary"]
+
+        if r.get("Degree"):
+            item["education"] = r["Degree"]
+
+        if r.get("YearsOfWorking"):
+            item["experience"] = r["YearsOfWorking"]
+
+        if description:
+            item["description"] = description
+
+        post_date = r.get("PostDate") or ""
+        if post_date:
+            item["expire_at"] = post_date[:10] + " 00:00:00"
+        else:
+            item["expire_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+        results.append(item)
+    return results
+
+
+class BtyyJobCrawler:
+    """倍特药业招聘官网爬虫 (btyy.zhiye.com)"""
+
+    BASE_URL = "https://btyy.zhiye.com/api/Jobad"
+    CATEGORY_MAP = {
+        "shezhao": "1",
+        "xiaozhao": "2",
+    }
+
+    def __init__(self):
+        self.session = requests.Session()
+        self.session.headers.update({
+            "Content-Type": "application/json;charset=UTF-8",
+            "Accept": "application/json, text/plain, */*",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
+            "Referer": "https://btyy.zhiye.com/",
+        })
+
+    def _get_job_list_page(self, category_id, page_index=0, page_size=20):
+        """获取单页岗位列表"""
+        url = f"{self.BASE_URL}/GetJobAdPageList"
+        payload = {
+            "PageIndex": page_index,
+            "PageSize": page_size,
+            "Category": [category_id],
+            "KeyWords": "",
+            "SpecialType": 0,
+            "PortalId": "",
+            "DisplayFields": [
+                "Category", "LocId", "HeadCount", "PostDate",
+                "ClassificationTwo", "WorkWeChatQrCode", "Degree",
+                "Kind", "Org"
+            ],
+        }
+        resp = self.session.post(url, json=payload)
+        resp.raise_for_status()
+        data = resp.json()
+        if data.get("Code") != 200:
+            raise Exception(f"API错误: {data.get('Message', '未知错误')}")
+        return data
+
+    def _get_all_jobs(self, category_id):
+        """获取某个分类下的所有岗位（自动分页）"""
+        first_page = self._get_job_list_page(category_id, page_index=0)
+        total = first_page["Count"]
+        all_records = first_page["Data"]
+
+        if total == 0:
+            return {"total": 0, "records": [], "position_ids": []}
+
+        total_pages = math.ceil(total / 20)
+        for page in range(1, total_pages):
+            time.sleep(0.3)
+            page_data = self._get_job_list_page(category_id, page_index=page)
+            all_records.extend(page_data["Data"])
+
+        position_ids = [r["Id"] for r in all_records]
+        return {"total": total, "records": all_records, "position_ids": position_ids}
+
+    def get_shezhao_list(self):
+        """获取社会招聘列表
+        返回: {"total": int, "records": list, "position_ids": list}
+        """
+        return self._get_all_jobs(self.CATEGORY_MAP["shezhao"])
+
+    def get_xiaozhao_list(self):
+        """获取校园招聘列表
+        返回: {"total": int, "records": list, "position_ids": list}
+        """
+        return self._get_all_jobs(self.CATEGORY_MAP["xiaozhao"])
+
+    def get_shixi_list(self):
+        """获取实习招聘列表（该网站无实习分类，返回空）"""
+        return {"total": 0, "records": [], "position_ids": []}
+
+    def get_position_detail(self, position_id):
+        """获取岗位详情
+        注：该网站列表接口已返回完整岗位信息（Duty、Require），
+        此方法从已获取的列表数据中提取，无需额外请求。
+        如需单独请求，可访问岗位页面。
+        """
+        for category_id in self.CATEGORY_MAP.values():
+            data = self._get_job_list_page(category_id, page_index=0, page_size=100)
+            for record in data.get("Data", []):
+                if record["Id"] == position_id:
+                    return record
+        return None
+
+
+if __name__ == "__main__":
+    crawler = BtyyJobCrawler()
+
+    print("=" * 60)
+    print("倍特药业 - 招聘岗位爬取")
+    print("=" * 60)
+
+    # 社会招聘
+    print("\n[社会招聘]")
+    shezhao = crawler.get_shezhao_list()
+    print(f"  共 {shezhao['total']} 个岗位")
+
+    # 校园招聘
+    print("\n[校园招聘]")
+    xiaozhao = crawler.get_xiaozhao_list()
+    print(f"  共 {xiaozhao['total']} 个岗位")
+
+    # 数据清洗
+    print("\n[数据清洗]")
+    task_crawl_id = 0
+    all_parsed = parse_to_db(shezhao["records"], task_crawl_id)
+    all_parsed += parse_to_db(xiaozhao["records"], task_crawl_id)
+    print(f"  清洗完成: {len(all_parsed)} 条")
+
+    # 打印样例
+    print("\n--- 样例 ---")
+    for k, v in all_parsed[0].items():
+        print(f"  {k}: {str(v)[:100]}")
+
+    # 保存
+    output_file = "crawl/btyy/btyy_parsed.json"
+    with open(output_file, "w", encoding="utf-8") as f:
+        json.dump(all_parsed, f, ensure_ascii=False, indent=2)
+    print(f"\n已保存到 {output_file}")