generated from kgod/ai-review-template
feat: add crawl scripts for recruitment websites
- btyy (倍特药业), fullsemi (富芯半导体): 北森平台爬虫
- hotjob (中国五矿): hotjob平台爬虫
- leinao (中科类脑): 静态HTML爬虫
- task_fetcher: 原子锁获取任务
- post.md: 抓取技能文档
- export_har: mitmproxy HAR导出工具
This commit is contained in:
@@ -0,0 +1,207 @@
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
import math
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
# Maps the site's recruitment category id (as a string) to the value stored
# in the database's ``recruit_category`` column.
CATEGORY_DB_MAP = {
    "1": 0,  # site "social recruitment" -> database social-hire value
    "2": 1,  # site "campus recruitment" -> database campus-hire value
}


def parse_to_db(records, task_crawl_id, company_id="btyy", company="倍特药业"):
    """Clean job records returned by the API into ``app_job_data`` row dicts.

    Records without a non-blank ``JobAdName`` are skipped. Optional columns
    (location, salary, education, experience, description) are only set when
    the source record carries a truthy value for them.

    :param records: iterable of job dicts as returned by the list API
    :param task_crawl_id: crawl task id, copied into every row
    :param company_id: company identifier, copied into every row and mixed
        into the content hash
    :param company: company display name, copied into every row
    :return: list[dict], one dict per kept record
    """
    results = []
    for r in records:
        job_title = (r.get("JobAdName") or "").strip()
        if not job_title:
            continue

        # "/" is treated as a placeholder for "no content" and is dropped.
        duty = r.get("Duty") or ""
        require = r.get("Require") or ""
        parts = []
        if duty and duty != "/":
            parts.append(f"【工作职责】\n{duty}")
        if require and require != "/":
            parts.append(f"【任职要求】\n{require}")
        description = "\n\n".join(parts)

        # Normalise the category to a string so that an integer or null
        # ``CategoryId`` cannot produce a URL prefix and a recruit_category
        # that disagree with each other. Missing/null falls back to "1".
        category_id = str(r.get("CategoryId") or "1")
        job_id = r.get("Id", "")
        prefix = "social" if category_id == "1" else "campus"
        detail_url = f"https://btyy.zhiye.com/{prefix}/jobs/{job_id}"

        # MD5 is used purely as a content fingerprint, not for security.
        content_hash = hashlib.md5(
            f"{job_title}|{company_id}|{description}".encode("utf-8")
        ).hexdigest()

        item = {
            "task_crawl_id": task_crawl_id,
            "job_title": job_title,
            "company_id": company_id,
            "company": company,
            "detail_url": detail_url,
            "recruit_category": CATEGORY_DB_MAP.get(category_id, 0),
            "content_hash": content_hash,
        }

        # Optional fields: only set when the record has a value.
        loc_names = r.get("LocNames")
        if loc_names:
            if isinstance(loc_names, str):
                # A bare string must not be joined character by character.
                item["location"] = loc_names
            else:
                item["location"] = ",".join(str(name) for name in loc_names)

        if r.get("Salary"):
            item["salary"] = r["Salary"]

        if r.get("Degree"):
            item["education"] = r["Degree"]

        if r.get("YearsOfWorking"):
            item["experience"] = r["YearsOfWorking"]

        if description:
            item["description"] = description

        # NOTE(review): ``expire_at`` is filled from the record's PostDate
        # (date part only), or the current local time when PostDate is
        # absent -- confirm this is the intended meaning of the column.
        post_date = r.get("PostDate") or ""
        if post_date:
            item["expire_at"] = post_date[:10] + " 00:00:00"
        else:
            item["expire_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        results.append(item)
    return results
|
||||
|
||||
|
||||
class BtyyJobCrawler:
    """Crawler for the Beite Pharma recruitment site (btyy.zhiye.com)."""

    BASE_URL = "https://btyy.zhiye.com/api/Jobad"
    # Internal category name -> category id sent to the list API.
    CATEGORY_MAP = {
        "shezhao": "1",
        "xiaozhao": "2",
    }
    # Page size used when crawling a whole category.
    PAGE_SIZE = 20
    # Page size used when scanning for a single position.
    DETAIL_PAGE_SIZE = 100
    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would block the crawl indefinitely.
    REQUEST_TIMEOUT = 15
    # Pause in seconds between consecutive page requests.
    REQUEST_INTERVAL = 0.3

    def __init__(self):
        """Create an HTTP session preloaded with browser-like headers."""
        self.session = requests.Session()
        self.session.headers.update({
            "Content-Type": "application/json;charset=UTF-8",
            "Accept": "application/json, text/plain, */*",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
            "Referer": "https://btyy.zhiye.com/",
        })

    def _get_job_list_page(self, category_id, page_index=0, page_size=20):
        """Fetch one page of the job list for a category.

        :param category_id: category id string sent in the ``Category`` field
        :param page_index: zero-based page index
        :param page_size: number of records requested per page
        :return: the decoded JSON response dict
        :raises RuntimeError: when the response ``Code`` is not 200
        :raises requests.HTTPError: on a non-2xx HTTP status
        """
        url = f"{self.BASE_URL}/GetJobAdPageList"
        payload = {
            "PageIndex": page_index,
            "PageSize": page_size,
            "Category": [category_id],
            "KeyWords": "",
            "SpecialType": 0,
            "PortalId": "",
            "DisplayFields": [
                "Category", "LocId", "HeadCount", "PostDate",
                "ClassificationTwo", "WorkWeChatQrCode", "Degree",
                "Kind", "Org"
            ],
        }
        resp = self.session.post(url, json=payload, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
        if data.get("Code") != 200:
            raise RuntimeError(f"API错误: {data.get('Message', '未知错误')}")
        return data

    def _iter_pages(self, category_id, page_size):
        """Yield every list-API response page for a category, in order.

        The first response's ``Count`` decides how many further pages are
        requested; a pause of ``REQUEST_INTERVAL`` precedes each of them.
        """
        first_page = self._get_job_list_page(
            category_id, page_index=0, page_size=page_size
        )
        yield first_page

        total = first_page.get("Count") or 0
        for page_index in range(1, math.ceil(total / page_size)):
            time.sleep(self.REQUEST_INTERVAL)
            yield self._get_job_list_page(
                category_id, page_index=page_index, page_size=page_size
            )

    def _get_all_jobs(self, category_id):
        """Fetch all jobs of a category, following pagination.

        :return: ``{"total": int, "records": list, "position_ids": list}``
        """
        pages = self._iter_pages(category_id, self.PAGE_SIZE)
        first_page = next(pages)
        total = first_page.get("Count") or 0
        if total == 0:
            return {"total": 0, "records": [], "position_ids": []}

        # Copy so the response dict's own list is not mutated; ``or []``
        # tolerates a null ``Data`` field.
        all_records = list(first_page.get("Data") or [])
        for page in pages:
            all_records.extend(page.get("Data") or [])

        position_ids = [r["Id"] for r in all_records]
        return {"total": total, "records": all_records, "position_ids": position_ids}

    def get_shezhao_list(self):
        """Fetch the social-recruitment job list.

        :return: ``{"total": int, "records": list, "position_ids": list}``
        """
        return self._get_all_jobs(self.CATEGORY_MAP["shezhao"])

    def get_xiaozhao_list(self):
        """Fetch the campus-recruitment job list.

        :return: ``{"total": int, "records": list, "position_ids": list}``
        """
        return self._get_all_jobs(self.CATEGORY_MAP["xiaozhao"])

    def get_shixi_list(self):
        """Return an empty result: this crawler has no internship category."""
        return {"total": 0, "records": [], "position_ids": []}

    def get_position_detail(self, position_id):
        """Look up a single job record by its ``Id``.

        The list endpoint already returns the full job fields (Duty,
        Require), so no separate detail endpoint is called; instead the list
        is re-queried for each category and scanned page by page until the
        record is found.

        :param position_id: value compared against each record's ``Id``
        :return: the matching record dict, or ``None`` when not found
        """
        for category_id in self.CATEGORY_MAP.values():
            # Walk every page: stopping after the first page would miss
            # positions beyond the first DETAIL_PAGE_SIZE records.
            for page in self._iter_pages(category_id, self.DETAIL_PAGE_SIZE):
                for record in page.get("Data") or []:
                    if record.get("Id") == position_id:
                        return record
        return None
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: crawl both categories, clean the records, print a
    # sample row and dump everything to a JSON file.
    import os  # local import: only needed when run as a script

    crawler = BtyyJobCrawler()

    print("=" * 60)
    print("倍特药业 - 招聘岗位爬取")
    print("=" * 60)

    # Social recruitment
    print("\n[社会招聘]")
    shezhao = crawler.get_shezhao_list()
    print(f"  共 {shezhao['total']} 个岗位")

    # Campus recruitment
    print("\n[校园招聘]")
    xiaozhao = crawler.get_xiaozhao_list()
    print(f"  共 {xiaozhao['total']} 个岗位")

    # Data cleaning
    print("\n[数据清洗]")
    task_crawl_id = 0
    all_parsed = parse_to_db(shezhao["records"], task_crawl_id)
    all_parsed += parse_to_db(xiaozhao["records"], task_crawl_id)
    print(f"  清洗完成: {len(all_parsed)} 条")

    # Print a sample row; skipped when nothing was crawled so that an empty
    # result does not raise IndexError.
    if all_parsed:
        print("\n--- 样例 ---")
        for k, v in all_parsed[0].items():
            print(f"  {k}: {str(v)[:100]}")

    # Save; create the output directory first so a fresh checkout or a
    # different working directory does not fail on open().
    output_file = "crawl/btyy/btyy_parsed.json"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_parsed, f, ensure_ascii=False, indent=2)
    print(f"\n已保存到 {output_file}")
|
||||
Reference in New Issue
Block a user