<!-- generated from kgod/ai-review-template · 357 lines · 8.9 KiB · Markdown -->

# Crawler 模块设计方案

## 状态

✅ 已完成

---

## 一、输入参数

| 参数 | 来源 | 说明 |
|-----|------|-----|
| url | 初始输入 | 列表页 URL |
| job_item_selector | Step 2 | 岗位项选择器 |
| item_change_type | Step 2 | redirect / new_tab / in_page |
| next_page_selector | Step 3 | 下一页按钮选择器(None = 无分页) |
| page_change_type | Step 3 | url_change / content_change / new_tab |
| field_selectors | Step 4 | 字段选择器字典,selector 为数组格式 |
| detail_area_selector | Step 4 | 详情区域选择器(仅 in_page 有值) |

---

## 二、限制条件

- 最大页数:5 页
- 单个岗位失败:跳过继续
- 超时时间:可配置(如 10 秒)

---

## 三、数据提取范围

| item_change_type | 提取范围 | detail_url 记录 |
|-----------------|---------|---------------|
| redirect | 跳转后的详情页整页 | 详情页 URL |
| new_tab | 新标签详情页整页 | 新标签页 URL |
| in_page | detail_area_selector 内部 | 列表页 URL |

---

## 四、入口逻辑

```
results = []

IF item_change_type == "redirect":
    results = 流程A_redirect()
ELSE IF item_change_type == "new_tab":
    results = 流程B_new_tab()
ELSE IF item_change_type == "in_page":
    results = 流程C_in_page()

RETURN results
```

---

## 五、流程 A:redirect

**特点**:点击岗位后整页跳转,返回后状态丢失,需重新打开并翻页恢复

```
初始化:
    page_index = 1
    item_index = 1
    results = []

    打开 url
    等待加载完成
    items_per_page = 用 job_item_selector 计算当前页岗位数

主循环:
    WHILE True:

        # ===== 1. 检查是否结束 =====
        IF items_per_page == 0:
            BREAK
        IF page_index > 5:
            BREAK

        # ===== 2. 处理当前岗位 =====
        TRY:
            elements = 获取所有岗位元素(job_item_selector)
            点击 elements[item_index - 1]
            等待页面跳转完成(URL 变化 或 networkidle)

            data = 在整页范围用 field_selectors 提取数据
            data["detail_url"] = 当前页面 URL
            results.append(data)
        CATCH:
            记录失败,跳过

        # ===== 3. 计算下一个位置 =====
        item_index += 1

        IF item_index > items_per_page:
            page_index += 1
            item_index = 1

            IF page_index > 5:
                BREAK
            IF next_page_selector == None:
                BREAK

        # ===== 4. 重新打开并恢复到目标页 =====
        goto(url)
        等待加载完成

        FOR i = 1 TO (page_index - 1):
            success, page = click_next_page(page, next_page_selector, page_change_type)
            IF not success:
                BREAK 主循环

        items_per_page = 重新计算当前页岗位数

RETURN results
```

---

## 六、流程 B:new_tab

**特点**:点击后新标签打开详情,关闭新标签后原页面状态保持

```
初始化:
    page_index = 1
    results = []

    打开 url
    等待加载完成

主循环:
    WHILE page_index <= 5:

        # ===== 1. 获取当前页岗位 =====
        items_per_page = 用 job_item_selector 计算当前页岗位数
        IF items_per_page == 0:
            BREAK

        # ===== 2. 遍历当前页所有岗位 =====
        FOR item_index = 1 TO items_per_page:
            TRY:
                elements = 获取所有岗位元素(job_item_selector)

                开始监听 context 的 "page" 事件
                点击 elements[item_index - 1]

                等待新标签页打开(超时则跳过)
                new_page = 获取新打开的标签页
                等待 new_page 加载完成

                data = 在 new_page 整页范围用 field_selectors 提取数据
                data["detail_url"] = new_page.url
                results.append(data)

                关闭 new_page
            CATCH:
                尝试关闭可能存在的新标签
                记录失败,跳过

        # ===== 3. 翻页 =====
        IF next_page_selector == None:
            BREAK

        success, page = click_next_page(page, next_page_selector, page_change_type)
        IF not success:
            BREAK

        page_index += 1

RETURN results
```

---

## 七、流程 C:in_page

**特点**:点击后弹窗/详情区展示,通过刷新页面恢复状态,兼容性更强

```
初始化:
    page_index = 1
    item_index = 1
    results = []

    打开 url
    等待加载完成
    items_per_page = 用 job_item_selector 计算当前页岗位数

主循环:
    WHILE True:

        # ===== 1. 检查是否结束 =====
        IF items_per_page == 0:
            BREAK
        IF page_index > 5:
            BREAK

        # ===== 2. 处理当前岗位 =====
        TRY:
            elements = 获取所有岗位元素(job_item_selector)
            点击 elements[item_index - 1]
            等待 detail_area_selector 出现(visible)

            detail_element = 获取 detail_area_selector 元素
            data = 在 detail_element 内部用 field_selectors 提取数据
            data["detail_url"] = 当前页面 URL
            results.append(data)
        CATCH:
            记录失败,跳过

        # ===== 3. 计算下一个位置 =====
        item_index += 1

        IF item_index > items_per_page:
            page_index += 1
            item_index = 1

            IF page_index > 5:
                BREAK
            IF next_page_selector == None:
                BREAK

        # ===== 4. 刷新页面恢复状态 =====
        goto(url)
        等待加载完成

        FOR i = 1 TO (page_index - 1):
            success, page = click_next_page(page, next_page_selector, page_change_type)
            IF not success:
                BREAK 主循环

        items_per_page = 重新计算当前页岗位数

RETURN results
```

---

## 八、统一翻页函数

**返回值**:(success: bool, page: Page) —— page_change_type 为 new_tab 时,返回的 page 是新标签页的引用,调用方必须用它替换原 page

```
函数 click_next_page(page, next_page_selector, page_change_type):

    # ===== 1. 检查按钮是否可点 =====
    element = 查找 next_page_selector

    IF element 不存在:
        RETURN (false, page)

    IF element 有 disabled 属性:
        RETURN (false, page)

    IF element 的 class 包含 "disabled":
        RETURN (false, page)

    IF element 不可见 (is_visible == false):
        RETURN (false, page)

    # ===== 2. 根据 page_change_type 处理 =====

    IF page_change_type == "url_change":
        before_url = page.url
        点击 element
        等待 URL 变化(page.url != before_url)
        等待加载完成(networkidle)
        RETURN (true, page)

    ELSE IF page_change_type == "content_change":
        before_text = 获取第一个岗位的文本内容
        点击 element
        等待第一个岗位文本变化(!= before_text)
        短暂等待确保渲染完成
        RETURN (true, page)

    ELSE IF page_change_type == "new_tab":
        开始监听 context 的 "page" 事件
        点击 element
        等待新标签页打开
        new_page = 获取新标签页
        等待 new_page 加载完成
        关闭原 page
        RETURN (true, new_page) # 返回新的 page 引用
```

---

## 九、数据提取函数

```
函数 extract_data(scope, field_selectors):
    # scope: 整个 page 或 detail_element

    data = {}

    FOR field_name, selector_info IN field_selectors:
        TRY:
            selectors = selector_info.selector # 数组格式
            IF selectors 为空数组:
                data[field_name] = None
                CONTINUE

            # 遍历所有选择器,提取文本并拼接
            texts = []
            FOR selector IN selectors:
                element = scope.query_selector(selector)
                IF element:
                    text = element.inner_text().strip()
                    IF text:
                        texts.append(text)

            data[field_name] = "\n".join(texts) IF texts ELSE None
        CATCH:
            data[field_name] = None

    RETURN data
```

---

## 十、字段选择器格式

`field_selectors` 统一使用数组格式:

```
{
    "job_title": {"selector": ["h1.title"], "sample": "Python工程师"},
    "description": {"selector": [".desc", ".req"], "sample": "负责...\n要求..."},
    "salary": {"selector": [".salary"], "sample": "25-40K"},
    ...
}
```

- 单选择器:`["h1.title"]`
- 多选择器:`[".desc", ".req"]`,提取结果用 `\n` 拼接

---

## 十一、输出格式

```
[
    {
        "job_title": "...",
        "salary": "...",
        "location": "...",
        "description": "...",
        "requirements": "...",
        "detail_url": "...",
        ...
    },
    ...
]
```