Files
post_crawler/test_crawler.py
T
kgod b3f35dbdea feat: Implement job scheduling and company recruitment search functionality
- Added a new scheduler module to manage periodic jobs for recruitment data processing.
- Created a search_company_graph module to handle the logic for searching company recruitment pages.
- Implemented nodes for searching, extracting links, verifying recruitment lists, and navigating to recruitment pages.
- Developed prompts for LLM to guide the extraction and verification processes.
- Added state management for tracking the search process and results.
- Created a test script for crawling job listings from various company websites.
2026-05-26 21:03:24 +08:00

63 lines
2.3 KiB
Python

"""Crawler 测试脚本"""
import asyncio
from src.crawler import crawl, CrawlerConfig
# 中兴招聘测试配置
# test_config: CrawlerConfig = {
# "url": "https://app.mokahr.com/social-recruitment/zte/47588#/jobs",
# "job_item_selector": ".jobs-list-WmE84RgZxp .container-aOp138AX_X.normal-TBuWTpDMcE.list-oR2doUijv4",
# "item_change_type": "redirect",
# "next_page_selector": ".sd-Pagination-pagination-2kuN2 .sd-Pagination-forward-3z80f",
# "page_change_type": "url_change",
# "field_selectors": {
# "job_title": {"selector": [".title-ROUQFdjmhP"]},
# "description": {"selector": [".job-description-VvfEUGocNE"]},
# "location": {"selector": [".info-UcB_mxJq8y span:first-child"]},
# "company": {"selector": [".basic-info-dB86EjV5uU span:nth-child(2)"]},
# },
# "detail_area_selector": None,
# }
# 美宜佳招聘测试配置
# test_config: CrawlerConfig = {
# "url": "https://meiyijia.jobs.feishu.cn/social/position/list",
# "job_item_selector": ".listItems__fca8c0 a",
# "item_change_type": "new_tab",
# "next_page_selector": ".pager__fca8c0 .atsx-pagination-next:not(.atsx-pagination-disabled)",
# "page_change_type": "url_change",
# "field_selectors": {
# "job_title": {"selector": [".positionItem-title-text"]},
# "description": {"selector": [".positionItem-jobDesc"]},
# "location": {"selector": [".positionItem-subTitle span"]},
# },
# "detail_area_selector": None,
# }
# Samsung recruitment site test configuration (the active one; the
# commented-out configs above target other sites).
# NOTE(review): the selector strings look like CSS selectors; how each key is
# interpreted is defined by src.crawler and is not visible from this file.
test_config: CrawlerConfig = {
    "url": "https://dearsamsung.zhiye.com/#/samsung/pc/szzw",
    "job_item_selector": ".BHGkB li",
    "item_change_type": "in_page",
    "next_page_selector": (
        "._8x6MD .ant-pagination-next:not([aria-disabled='true']) "
        ".ant-pagination-item-link"
    ),
    "page_change_type": "content_change",
    "field_selectors": {
        "job_title": {"selector": ["h2"]},
        # Two candidate selectors are listed for the description field.
        "description": {"selector": [".aCl-8 p", ".aCl-8 pre"]},
    },
    "detail_area_selector": ".FLf6j",
}
async def main():
    """Run the crawler against ``test_config`` and print what it returned.

    Awaits ``crawl(test_config, headless=False)``, prints a summary line with
    the number of returned items, then prints every key/value pair of each
    item under a 1-based numbered header. Returns ``None``.
    """
    records = await crawl(test_config, headless=False)
    print(f"\n爬取完成,共 {len(records)} 条数据")

    # Number the records from 1 so the printed headers are human-friendly.
    for position, record in enumerate(records, start=1):
        print(f"\n--- 岗位 {position} ---")
        for field_name, field_value in record.items():
            print(f"{field_name}: {field_value}")


# Only start the crawl when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())