# post_crawler/crawl/leinao/leinao_crawler.py

import requests
import hashlib
import time
from datetime import datetime
from bs4 import BeautifulSoup


# Maps the category id used in the site's /job/<id> listing URL to the
# integer written into each record's "recruit_category" field.
CATEGORY_DB_MAP = {
    "7": 0,  # /job/7: social (experienced-hire) recruitment
    "8": 1,  # /job/8: campus recruitment
}


def parse_to_db(records, task_crawl_id, company_id="leinao", company="中科类脑"):
    """Clean parsed job records into rows shaped for the app_job_data table.

    Records whose ``job_title`` is missing or blank are dropped.  Every
    surviving row carries the task id, the stripped title, the company
    identifiers, the detail URL (``""`` when absent), the recruit category
    (``0`` when absent) and an MD5 ``content_hash`` computed over
    ``"<title>|<company_id>|<description>"``.  ``location``, ``salary``,
    ``education``, ``experience`` and ``description`` are copied only when
    they hold a truthy value.

    ``expire_at`` is ``post_date`` with ``" 00:00:00"`` appended when a
    post date is present, otherwise the current local time formatted as
    ``%Y-%m-%d %H:%M:%S``.

    Args:
        records: Iterable of dict-like job records.
        task_crawl_id: Identifier stored unchanged on every row.
        company_id: Company key stored on every row and mixed into the hash.
        company: Company display name stored on every row.

    Returns:
        A list of row dicts, in the same order as the accepted records.
    """
    optional_keys = ("location", "salary", "education", "experience")
    rows = []

    for record in records:
        title = (record.get("job_title") or "").strip()
        if not title:
            # A row without a title is unusable, so skip it.
            continue

        body = record.get("description") or ""
        fingerprint = f"{title}|{company_id}|{body}".encode("utf-8")

        row = {
            "task_crawl_id": task_crawl_id,
            "job_title": title,
            "company_id": company_id,
            "company": company,
            "detail_url": record.get("detail_url") or "",
            "recruit_category": record.get("recruit_category", 0),
            "content_hash": hashlib.md5(fingerprint).hexdigest(),
        }

        # Optional columns are emitted only when the record has a value.
        row.update({key: record[key] for key in optional_keys if record.get(key)})
        if body:
            row["description"] = body

        posted = record.get("post_date") or ""
        row["expire_at"] = (
            posted + " 00:00:00"
            if posted
            else datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        )

        rows.append(row)

    return rows


class LeinaoJobCrawler:
    """Crawler for the Zhongke Leinao (中科类脑) careers site, www.leinao.ai.

    Listing pages are fetched from ``/job/<category_id>`` and detail pages
    from ``/jobdetail/<job_id>``; both are HTML parsed with BeautifulSoup.
    """

    BASE_URL = "https://www.leinao.ai"
    # Listing name -> category id used in the /job/<id> URL.
    CATEGORY_MAP = {
        "shezhao": "7",
        "xiaozhao": "8",
    }
    # Seconds to wait for a response.  requests applies no timeout unless one
    # is passed, so a stalled server would otherwise block the crawl forever.
    DEFAULT_TIMEOUT = 15
    # Field names assigned, in order, to the text fragments of a listing link.
    LIST_FIELDS = (
        "job_title",
        "location",
        "job_type",
        "category_name",
        "post_date",
        "org",
    )

    def __init__(self, timeout=None):
        """Create the HTTP session used for every request.

        Args:
            timeout: Per-request timeout in seconds; ``DEFAULT_TIMEOUT`` is
                used when omitted.
        """
        self.timeout = self.DEFAULT_TIMEOUT if timeout is None else timeout
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
        })

    def _fetch_soup(self, url):
        """GET *url* and return the parsed HTML; raises on an HTTP error status."""
        resp = self.session.get(url, timeout=self.timeout)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")

    @staticmethod
    def _job_id_from_href(href):
        """Return the last path segment of *href*.

        Any query string, fragment or trailing slash is ignored so that
        ``/jobdetail/12/`` and ``/jobdetail/12?x=1`` both yield ``"12"``.
        """
        path = href.split("#", 1)[0].split("?", 1)[0]
        return path.rstrip("/").split("/")[-1]

    @staticmethod
    def _parse_meta_text(text):
        """Extract location / salary / experience from one ``<h6>`` text.

        Only text mentioning "薪资" or "工作经验" is inspected.  It is split on
        whitespace (non-breaking spaces included) and each token classified:
        a "薪资：" token gives the salary unless it is "面议", a "工作经验："
        token gives the experience unless it is "不限", and any other token
        containing "·" but not "不限" is taken as the location.

        Returns:
            A dict holding only the keys that were found.
        """
        text = text.replace("\xa0", " ")
        if "薪资" not in text and "工作经验" not in text:
            return {}

        found = {}
        for part in text.split():
            if "薪资：" in part:
                value = part.replace("薪资：", "")
                if value and value != "面议":
                    found["salary"] = value
            elif "工作经验：" in part:
                value = part.replace("工作经验：", "")
                if value and value != "不限":
                    found["experience"] = value
            elif "·" in part and "不限" not in part:
                found["location"] = part
        return found

    def _get_job_list(self, category_id):
        """Fetch one listing page and parse its job links.

        Every ``<a>`` whose href contains ``/jobdetail/`` becomes a record.
        The link's non-empty text fragments are mapped positionally onto
        ``LIST_FIELDS``; a location of "不限" is stored as ``None``.

        Returns:
            ``{"total": int, "records": list[dict], "position_ids": list[str]}``.

        Raises:
            requests.RequestException: On a network error, a timeout or an
                HTTP error status.
        """
        soup = self._fetch_soup(f"{self.BASE_URL}/job/{category_id}")
        links = soup.find_all("a", href=lambda h: h and "/jobdetail/" in h)

        records = []
        for link in links:
            href = link.get("href", "")
            record = {
                "job_id": self._job_id_from_href(href),
                "detail_url": f"{self.BASE_URL}{href}",
                "recruit_category": CATEGORY_DB_MAP.get(category_id, 0),
            }

            # Iterate the fragments directly instead of joining on "|" and
            # splitting again: a "|" inside the text would otherwise be split
            # and shift every following field by one position.
            record.update(zip(self.LIST_FIELDS, link.stripped_strings))
            if record.get("location") == "不限":
                record["location"] = None

            records.append(record)

        position_ids = [r["job_id"] for r in records]
        return {"total": len(records), "records": records, "position_ids": position_ids}

    def get_shezhao_list(self):
        """Return the social (experienced-hire) recruitment listing."""
        return self._get_job_list(self.CATEGORY_MAP["shezhao"])

    def get_xiaozhao_list(self):
        """Return the campus recruitment listing."""
        return self._get_job_list(self.CATEGORY_MAP["xiaozhao"])

    def get_shixi_list(self):
        """Return an empty internship listing (the site has no such category)."""
        return {"total": 0, "records": [], "position_ids": []}

    def get_position_detail(self, job_id):
        """Fetch and parse the detail page of one job.

        The title comes from the page ``<title>`` with the "-中科类脑" suffix
        removed.  The description joins, one per line, every ``<p>`` under
        ``<main>`` (or the whole document when there is no ``<main>``) whose
        text is longer than five characters.  Location, salary and experience
        are parsed from the ``<h6>`` elements and stay ``None`` when absent.

        Raises:
            requests.RequestException: On a network error, a timeout or an
                HTTP error status.
        """
        url = f"{self.BASE_URL}/jobdetail/{job_id}"
        soup = self._fetch_soup(url)
        main = soup.find("main") or soup

        title_tag = soup.find("title")
        job_title = title_tag.get_text().replace("-中科类脑", "").strip() if title_tag else ""

        paragraph_texts = (p.get_text(strip=True) for p in main.find_all("p"))
        description_parts = [text for text in paragraph_texts if len(text) > 5]

        meta = {"location": None, "salary": None, "experience": None}
        for h6 in main.find_all("h6"):
            # A later <h6> overrides an earlier one only for fields it supplies.
            meta.update(self._parse_meta_text(h6.get_text(strip=True)))

        return {
            "job_id": job_id,
            "job_title": job_title,
            "description": "\n".join(description_parts),
            "location": meta["location"],
            "salary": meta["salary"],
            "experience": meta["experience"],
            "detail_url": url,
        }

    def crawl_all(self, delay=0.3):
        """Crawl every category in ``CATEGORY_MAP``: listing, then each detail page.

        Args:
            delay: Seconds to sleep before each detail request.

        Returns:
            A list of detail dicts, each extended with ``recruit_category``
            and the ``post_date`` taken from the listing page.

        Raises:
            requests.RequestException: Propagated from the first failing
                request; records collected so far are not returned.
        """
        all_records = []

        for name, cat_id in self.CATEGORY_MAP.items():
            job_list = self._get_job_list(cat_id)
            recruit_cat = CATEGORY_DB_MAP.get(cat_id, 0)
            print(f"[{name}] 共 {job_list['total']} 个岗位")

            for r in job_list["records"]:
                time.sleep(delay)
                detail = self.get_position_detail(r["job_id"])
                detail["recruit_category"] = recruit_cat
                detail["post_date"] = r.get("post_date")
                all_records.append(detail)

        return all_records


if __name__ == "__main__":
    # Manual smoke run: crawl every category, clean the rows, show one sample.
    banner = "=" * 60
    leinao = LeinaoJobCrawler()

    print(banner)
    print("中科类脑 - 招聘岗位爬取")
    print(banner)

    crawled = leinao.crawl_all()

    print("\n[数据清洗]")
    demo_task_id = 0  # placeholder task id for this standalone run
    rows = parse_to_db(crawled, demo_task_id)
    print(f"  清洗完成: {len(rows)} 条")

    if rows:
        print("\n--- 样例 ---")
        sample = rows[0]
        for key in sample:
            # Truncate long values so the sample stays readable.
            print(f"  {key}: {str(sample[key])[:100]}")