generated from kgod/ai-review-template
c06f595559
- btyy (倍特药业), fullsemi (富芯半导体): 北森平台爬虫 - hotjob (中国五矿): hotjob平台爬虫 - leinao (中科类脑): 静态HTML爬虫 - task_fetcher: 原子锁获取任务 - post.md: 抓取技能文档 - export_har: mitmproxy HAR导出工具
217 lines
7.1 KiB
Python
217 lines
7.1 KiB
Python
import hashlib
import time
from datetime import datetime
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Maps the site's listing category id (the "<id>" in "/job/<id>") to the
# recruit_category code written into each crawled record.
CATEGORY_DB_MAP = {
    "7": 0,  # /job/7: social (experienced-hire) recruitment
    "8": 1,  # /job/8: campus recruitment
}
|
|
|
|
|
|
def parse_to_db(records, task_crawl_id, company_id="leinao", company="中科类脑"):
    """Normalise crawled job records into rows for the app_job_data table.

    Args:
        records: Iterable of dicts as produced by the crawler. Only
            ``job_title`` is required; every other key is optional.
        task_crawl_id: Identifier of the crawl task, copied into every row.
        company_id: Company key stored on every row and mixed into the hash.
        company: Company display name stored on every row.

    Returns:
        A list of row dicts. Records whose title is missing or blank are
        dropped. ``location``, ``salary``, ``education``, ``experience`` and
        ``description`` appear only when the record carries a truthy value.
    """
    optional_keys = ("location", "salary", "education", "experience")
    rows = []

    for record in records:
        job_title = (record.get("job_title") or "").strip()
        if not job_title:
            # A row without a title is unusable, so skip it entirely.
            continue

        description = record.get("description") or ""
        # Fingerprint of title + company + description for this posting.
        digest = hashlib.md5(
            f"{job_title}|{company_id}|{description}".encode("utf-8")
        ).hexdigest()

        row = {
            "task_crawl_id": task_crawl_id,
            "job_title": job_title,
            "company_id": company_id,
            "company": company,
            "detail_url": record.get("detail_url") or "",
            "recruit_category": record.get("recruit_category", 0),
            "content_hash": digest,
        }
        row.update(
            {key: record[key] for key in optional_keys if record.get(key)}
        )
        if description:
            row["description"] = description

        # Use the posting date (at midnight) when known, otherwise "now".
        posted_on = record.get("post_date") or ""
        row["expire_at"] = (
            posted_on + " 00:00:00"
            if posted_on
            else datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        )

        rows.append(row)

    return rows
|
|
|
|
|
|
class LeinaoJobCrawler:
    """Crawler for the Zhongke Leinao careers site (www.leinao.ai).

    Listing pages live at ``/job/<category_id>`` and link to detail pages at
    ``/jobdetail/<job_id>``; both are parsed from static HTML.
    """

    BASE_URL = "https://www.leinao.ai"
    CATEGORY_MAP = {
        "shezhao": "7",
        "xiaozhao": "8",
    }

    # Seconds to wait on each HTTP request. requests never times out unless
    # told to, so a stalled server would otherwise hang the whole crawl.
    REQUEST_TIMEOUT = 15
    # Pause between detail-page requests, in seconds.
    REQUEST_DELAY = 0.3

    # A listing entry exposes its fields positionally, in this order.
    _LIST_FIELDS = (
        "job_title",
        "location",
        "job_type",
        "category_name",
        "post_date",
        "org",
    )
    # Labels are accepted with either an ASCII or a full-width colon.
    _SALARY_LABELS = ("薪资:", "薪资：")
    _EXPERIENCE_LABELS = ("工作经验:", "工作经验：")

    def __init__(self):
        """Create an HTTP session that sends a desktop-browser User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
        })

    @staticmethod
    def _job_id_from_href(href):
        """Return the path segment that follows ``/jobdetail/`` in *href*.

        Query strings, fragments and trailing slashes are ignored. Returns
        an empty string when no id can be found.
        """
        path = urlsplit(href).path
        tail = path.partition("/jobdetail/")[2]
        return tail.strip("/").split("/")[0]

    @classmethod
    def _record_from_texts(cls, texts):
        """Map the text fragments of one listing link onto named fields.

        Fragments beyond the known fields are ignored; missing trailing
        fragments simply leave their keys out. A location of "不限"
        (unrestricted) is stored as ``None``.
        """
        fields = dict(zip(cls._LIST_FIELDS, texts))
        if fields.get("location") == "不限":
            fields["location"] = None
        return fields

    @staticmethod
    def _labelled_value(part, labels):
        """Return *part* with its label removed, or ``None`` if unlabelled."""
        for label in labels:
            if label in part:
                return part.replace(label, "")
        return None

    @classmethod
    def _parse_job_meta(cls, texts):
        """Extract location, salary and experience from ``<h6>`` texts.

        Only texts mentioning "薪资" (salary) or "工作经验" (experience) are
        inspected. Placeholder values ("面议" for salary, "不限" for
        experience or location) are treated as absent. When several texts
        provide the same field, the last one wins.
        """
        meta = {"location": None, "salary": None, "experience": None}
        for raw in texts:
            text = raw.replace("\xa0", " ")
            if "薪资" not in text and "工作经验" not in text:
                continue
            for part in text.split():
                salary = cls._labelled_value(part, cls._SALARY_LABELS)
                if salary is not None:
                    if salary and salary != "面议":
                        meta["salary"] = salary
                    continue
                experience = cls._labelled_value(part, cls._EXPERIENCE_LABELS)
                if experience is not None:
                    if experience and experience != "不限":
                        meta["experience"] = experience
                    continue
                # Unlabelled fragments such as "province·city" are locations.
                if "·" in part and "不限" not in part:
                    meta["location"] = part
        return meta

    def _get_job_list(self, category_id):
        """Fetch one listing page and parse its job links.

        Args:
            category_id: The ``<id>`` of the ``/job/<id>`` listing page.

        Returns:
            ``{"total": int, "records": list[dict], "position_ids": list}``.
            Links from which no job id can be extracted are skipped.

        Raises:
            requests.RequestException: On timeout, connection failure or a
                non-2xx response.
        """
        url = f"{self.BASE_URL}/job/{category_id}"
        resp = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        links = soup.find_all("a", href=lambda h: h and "/jobdetail/" in h)

        recruit_category = CATEGORY_DB_MAP.get(category_id, 0)
        records = []
        for link in links:
            href = link.get("href", "")
            job_id = self._job_id_from_href(href)
            if not job_id:
                continue

            record = {
                "job_id": job_id,
                # urljoin copes with absolute and relative hrefs alike.
                "detail_url": urljoin(self.BASE_URL, href),
                "recruit_category": recruit_category,
            }
            # Iterate the text nodes directly rather than joining and
            # re-splitting on a separator that may occur inside a field.
            record.update(self._record_from_texts(list(link.stripped_strings)))
            records.append(record)

        position_ids = [r["job_id"] for r in records]
        return {"total": len(records), "records": records, "position_ids": position_ids}

    def get_shezhao_list(self):
        """Return the social (experienced-hire) recruitment listing."""
        return self._get_job_list(self.CATEGORY_MAP["shezhao"])

    def get_xiaozhao_list(self):
        """Return the campus recruitment listing."""
        return self._get_job_list(self.CATEGORY_MAP["xiaozhao"])

    def get_shixi_list(self):
        """Return an empty listing: the site has no internship category."""
        return {"total": 0, "records": [], "position_ids": []}

    def get_position_detail(self, job_id):
        """Fetch and parse the detail page of one job.

        Args:
            job_id: The ``<id>`` of the ``/jobdetail/<id>`` page.

        Returns:
            A dict with ``job_id``, ``job_title``, ``description``,
            ``location``, ``salary``, ``experience`` and ``detail_url``.
            Fields that cannot be found are ``None`` (``""`` for the title
            and description).

        Raises:
            requests.RequestException: On timeout, connection failure or a
                non-2xx response.
        """
        url = f"{self.BASE_URL}/jobdetail/{job_id}"
        resp = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        main = soup.find("main") or soup

        title_tag = soup.find("title")
        job_title = title_tag.get_text().replace("-中科类脑", "").strip() if title_tag else ""

        # Very short paragraphs (5 characters or fewer) are dropped.
        paragraph_texts = (p.get_text(strip=True) for p in main.find_all("p"))
        description_parts = [text for text in paragraph_texts if len(text) > 5]

        meta = self._parse_job_meta(
            h6.get_text(strip=True) for h6 in main.find_all("h6")
        )

        return {
            "job_id": job_id,
            "job_title": job_title,
            "description": "\n".join(description_parts),
            "location": meta["location"],
            "salary": meta["salary"],
            "experience": meta["experience"],
            "detail_url": url,
        }

    def crawl_all(self):
        """Crawl every category listing and the detail page of each job.

        Returns:
            A list of detail dicts (see ``get_position_detail``), each
            extended with ``recruit_category`` and the listing's
            ``post_date``.
        """
        all_records = []

        for name, cat_id in self.CATEGORY_MAP.items():
            job_list = self._get_job_list(cat_id)
            recruit_cat = CATEGORY_DB_MAP.get(cat_id, 0)
            print(f"[{name}] 共 {job_list['total']} 个岗位")

            for r in job_list["records"]:
                # Throttle detail requests.
                time.sleep(self.REQUEST_DELAY)
                detail = self.get_position_detail(r["job_id"])
                detail["recruit_category"] = recruit_cat
                detail["post_date"] = r.get("post_date")
                all_records.append(detail)

        return all_records
|
|
|
|
|
|
def main():
    """Crawl all categories, normalise the results and print one sample row."""
    crawler = LeinaoJobCrawler()

    rule = "=" * 60
    print(rule)
    print("中科类脑 - 招聘岗位爬取")
    print(rule)

    all_details = crawler.crawl_all()

    print(f"\n[数据清洗]")
    # Stand-alone runs are not tied to a real crawl task, hence id 0.
    parsed = parse_to_db(all_details, 0)
    print(f" 清洗完成: {len(parsed)} 条")

    if not parsed:
        return

    print("\n--- 样例 ---")
    for k, v in parsed[0].items():
        # Truncate long values so the sample stays readable.
        print(f" {k}: {str(v)[:100]}")


if __name__ == "__main__":
    main()
|