"""Crawler for the job postings published on www.leinao.ai.

The module fetches the recruitment list pages, follows each posting to its
detail page, and cleans the combined data into rows for storage.
"""

import hashlib
import time
from datetime import datetime
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Maps the site's list-page category id (the N in /job/N) to the
# recruit_category value stored with each record.
CATEGORY_DB_MAP = {
    "7": 0,  # /job/7: social (experienced-hire) recruitment
    "8": 1,  # /job/8: campus recruitment
}

# Optional fields copied into a cleaned row only when they hold a truthy value.
_OPTIONAL_FIELDS = ("location", "salary", "education", "experience")

# Keys assigned, in order, to the text fragments found inside one list-page link.
_LIST_COLUMNS = (
    "job_title",
    "location",
    "job_type",
    "category_name",
    "post_date",
    "org",
)


def parse_to_db(records, task_crawl_id, company_id="leinao", company="中科类脑"):
    """Clean parsed job records into rows for the app_job_data table.

    Args:
        records: Iterable of dicts as produced by
            ``LeinaoJobCrawler.crawl_all`` (keys such as ``job_title``,
            ``description``, ``detail_url``, ``post_date`` ...).
        task_crawl_id: Identifier of the crawl task, copied into every row.
        company_id: Company identifier stored in every row and mixed into
            the content hash.
        company: Company display name stored in every row.

    Returns:
        A list of dicts. Records whose title is missing or blank are dropped.
        ``location``, ``salary``, ``education``, ``experience`` and
        ``description`` are only present when the source value is truthy.
    """
    results = []
    for r in records:
        job_title = (r.get("job_title") or "").strip()
        if not job_title:
            continue

        description = r.get("description") or ""
        # MD5 is used here only as a content fingerprint, not for security.
        content_hash = hashlib.md5(
            f"{job_title}|{company_id}|{description}".encode("utf-8")
        ).hexdigest()

        item = {
            "task_crawl_id": task_crawl_id,
            "job_title": job_title,
            "company_id": company_id,
            "company": company,
            "detail_url": r.get("detail_url") or "",
            "recruit_category": r.get("recruit_category", 0),
            "content_hash": content_hash,
        }
        for field in _OPTIONAL_FIELDS:
            if r.get(field):
                item[field] = r[field]
        if description:
            item["description"] = description

        # expire_at is the posting date at midnight when one is known,
        # otherwise the current local time.
        post_date = r.get("post_date") or ""
        if post_date:
            item["expire_at"] = post_date + " 00:00:00"
        else:
            item["expire_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        results.append(item)
    return results


class LeinaoJobCrawler:
    """Crawler for the recruitment pages of www.leinao.ai."""

    BASE_URL = "https://www.leinao.ai"
    # Recruitment kind -> category id used in the /job/<id> list URL.
    CATEGORY_MAP = {
        "shezhao": "7",
        "xiaozhao": "8",
    }
    # Seconds to wait for the server before giving up; without a timeout
    # requests can block forever on a stalled connection.
    REQUEST_TIMEOUT = 15

    def __init__(self):
        """Create an HTTP session that sends a browser-like User-Agent."""
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
        })

    def _fetch_soup(self, url):
        """Download *url* and return it parsed with the built-in HTML parser.

        Raises:
            requests.RequestException: on connection errors, timeouts, or a
                non-success HTTP status.
        """
        resp = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")

    def _get_job_list(self, category_id):
        """Fetch one list page and parse the job links it contains.

        Args:
            category_id: Category id as used in the ``/job/<id>`` URL.

        Returns:
            A dict with ``total`` (number of records), ``records`` (one dict
            per job link) and ``position_ids`` (the job ids, in page order).
        """
        url = f"{self.BASE_URL}/job/{category_id}"
        soup = self._fetch_soup(url)
        links = soup.find_all("a", href=lambda h: h and "/jobdetail/" in h)

        records = []
        for link in links:
            href = link.get("href", "")
            # Take the path segment right after /jobdetail/ so that trailing
            # slashes, query strings and fragments do not corrupt the id.
            tail = urlsplit(href).path.partition("/jobdetail/")[2]
            job_id = tail.split("/")[0]
            if not job_id:
                continue

            # The link's visible text fragments are positional columns.
            # "|" is used as the separator, so a fragment that itself
            # contains "|" would be split into two columns.
            fragments = link.get_text(separator="|", strip=True).split("|")
            fragments = [t for t in fragments if t]

            record = {
                "job_id": job_id,
                # urljoin copes with absolute and relative hrefs alike.
                "detail_url": urljoin(url, href),
                "recruit_category": CATEGORY_DB_MAP.get(category_id, 0),
            }
            # zip stops at the shorter sequence, so missing trailing columns
            # are simply left out of the record.
            for key, value in zip(_LIST_COLUMNS, fragments):
                record[key] = value
            # "不限" is treated as "no specific location".
            if record.get("location") == "不限":
                record["location"] = None

            records.append(record)

        position_ids = [r["job_id"] for r in records]
        return {
            "total": len(records),
            "records": records,
            "position_ids": position_ids,
        }

    def get_shezhao_list(self):
        """Return the social (experienced-hire) recruitment list."""
        return self._get_job_list(self.CATEGORY_MAP["shezhao"])

    def get_xiaozhao_list(self):
        """Return the campus recruitment list."""
        return self._get_job_list(self.CATEGORY_MAP["xiaozhao"])

    def get_shixi_list(self):
        """Return an empty internship list (the site has no such category)."""
        return {"total": 0, "records": [], "position_ids": []}

    def get_position_detail(self, job_id):
        """Fetch and parse the detail page of a single job.

        Args:
            job_id: Id used in the ``/jobdetail/<id>`` URL.

        Returns:
            A dict with ``job_id``, ``job_title``, ``description``,
            ``location``, ``salary``, ``experience`` and ``detail_url``.
            ``location``, ``salary`` and ``experience`` are ``None`` when
            they are not found or carry a placeholder value.
        """
        url = f"{self.BASE_URL}/jobdetail/{job_id}"
        soup = self._fetch_soup(url)
        # Fall back to the whole document when the page has no <main>.
        main = soup.find("main") or soup

        title_tag = soup.find("title")
        if title_tag:
            job_title = title_tag.get_text().replace("-中科类脑", "").strip()
        else:
            job_title = ""

        # Paragraphs of five characters or fewer are left out of the
        # description.
        description_parts = []
        for paragraph in main.find_all("p"):
            text = paragraph.get_text(strip=True)
            if len(text) > 5:
                description_parts.append(text)

        location = None
        salary = None
        experience = None
        for h6 in main.find_all("h6"):
            text = h6.get_text(strip=True).replace("\xa0", " ")
            # Only headings mentioning salary or experience are inspected.
            if "薪资" not in text and "工作经验" not in text:
                continue
            # NOTE(review): the labels below are matched with an ASCII colon
            # only - confirm the site never uses a full-width colon.
            for part in text.split():
                if "薪资:" in part:
                    sal = part.replace("薪资:", "")
                    if sal and sal != "面议":
                        salary = sal
                elif "工作经验:" in part:
                    exp = part.replace("工作经验:", "")
                    if exp and exp != "不限":
                        experience = exp
                elif "·" in part and "不限" not in part:
                    location = part

        return {
            "job_id": job_id,
            "job_title": job_title,
            "description": "\n".join(description_parts),
            "location": location,
            "salary": salary,
            "experience": experience,
            "detail_url": url,
        }

    def crawl_all(self):
        """Crawl every category in CATEGORY_MAP: list page plus each detail page.

        Returns:
            A list of detail dicts (see ``get_position_detail``), each
            extended with ``recruit_category`` and the ``post_date`` taken
            from the list page.

        Raises:
            requests.RequestException: if any list or detail request fails.
        """
        all_records = []
        for name, cat_id in self.CATEGORY_MAP.items():
            job_list = self._get_job_list(cat_id)
            recruit_cat = CATEGORY_DB_MAP.get(cat_id, 0)
            print(f"[{name}] 共 {job_list['total']} 个岗位")
            for r in job_list["records"]:
                # Short pause between detail requests.
                time.sleep(0.3)
                detail = self.get_position_detail(r["job_id"])
                detail["recruit_category"] = recruit_cat
                detail["post_date"] = r.get("post_date")
                all_records.append(detail)
        return all_records


def main():
    """Crawl all jobs, clean them, and print a summary with one sample row."""
    crawler = LeinaoJobCrawler()

    print("=" * 60)
    print("中科类脑 - 招聘岗位爬取")
    print("=" * 60)

    all_details = crawler.crawl_all()

    print("\n[数据清洗]")
    task_crawl_id = 0
    parsed = parse_to_db(all_details, task_crawl_id)
    print(f"  清洗完成: {len(parsed)} 条")

    if parsed:
        print("\n--- 样例 ---")
        for k, v in parsed[0].items():
            # Truncate long values so the sample stays readable.
            print(f"  {k}: {str(v)[:100]}")


if __name__ == "__main__":
    main()