generated from kgod/ai-review-template
308 lines
7.2 KiB
Markdown
308 lines
7.2 KiB
Markdown
# Playwright Python 使用指南
|
||
|
||
本文档覆盖项目中会用到的 Playwright 核心功能。
|
||
|
||
## 安装
|
||
|
||
```bash
|
||
pip install playwright
|
||
playwright install chromium # 安装浏览器
|
||
```
|
||
|
||
## 基本结构
|
||
|
||
```python
|
||
from playwright.async_api import async_playwright
|
||
|
||
async def main():
|
||
async with async_playwright() as p:
|
||
# 启动浏览器
|
||
browser = await p.chromium.launch(headless=False) # False 可看到浏览器界面
|
||
|
||
# 创建页面
|
||
page = await browser.new_page()
|
||
|
||
# 操作页面...
|
||
|
||
# 关闭
|
||
await browser.close()
|
||
```
|
||
|
||
## 核心操作
|
||
|
||
### 1. 导航
|
||
|
||
```python
|
||
# 访问 URL
|
||
await page.goto("https://example.com")
|
||
|
||
# 等待加载完成(可选策略)
|
||
await page.goto("https://example.com", wait_until="networkidle") # 网络空闲
|
||
await page.goto("https://example.com", wait_until="domcontentloaded") # DOM 加载完成
|
||
|
||
# 获取当前 URL
|
||
current_url = page.url
|
||
```
|
||
|
||
### 2. 获取内容
|
||
|
||
```python
|
||
# 获取整个页面 HTML
|
||
html = await page.content()
|
||
|
||
# 获取 body 内部 HTML
|
||
body_html = await page.inner_html("body")
|
||
|
||
# 获取元素内部 HTML
|
||
area_html = await page.inner_html(".job-list")
|
||
|
||
# 获取元素外部 HTML(包含自身标签)
|
||
outer = await page.evaluate("document.querySelector('.job-item').outerHTML")
|
||
|
||
# 获取文本内容
|
||
text = await page.inner_text(".title")
|
||
|
||
# 获取页面标题
|
||
title = await page.title()
|
||
```
|
||
|
||
### 3. 选择器与元素定位
|
||
|
||
```python
|
||
# 单个元素
|
||
element = await page.query_selector(".job-item") # 返回 ElementHandle 或 None
|
||
|
||
# 多个元素
|
||
elements = await page.query_selector_all(".job-item") # 返回列表
|
||
count = len(elements)
|
||
|
||
# 检查元素是否存在
|
||
if await page.query_selector(".job-item"):
|
||
print("存在")
|
||
|
||
# Locator API(推荐,更稳定)
|
||
locator = page.locator(".job-item")
|
||
count = await locator.count()
|
||
first = locator.first
|
||
nth = locator.nth(2) # 第3个元素
|
||
```
|
||
|
||
### 4. 点击与交互
|
||
|
||
```python
|
||
# 点击元素
|
||
await page.click(".job-item")
|
||
|
||
# 点击第一个匹配的元素
|
||
await page.locator(".job-item").first.click()
|
||
|
||
# 点击第 N 个元素(nth 的下标从 0 开始,nth(0) 即第 1 个)
|
||
await page.locator(".job-item").nth(0).click()
|
||
|
||
# 带等待的点击
|
||
await page.click(".job-item", timeout=5000) # 最多等 5 秒
|
||
|
||
# 输入文本
|
||
await page.fill("input[name='search']", "关键词")
|
||
|
||
# 按键
|
||
await page.keyboard.press("Enter")
|
||
```
|
||
|
||
### 5. 等待
|
||
|
||
```python
|
||
import asyncio
|
||
|
||
# 简单等待(秒)
|
||
await asyncio.sleep(2)
|
||
|
||
# 等待选择器出现
|
||
await page.wait_for_selector(".job-list", timeout=10000)
|
||
|
||
# 等待选择器消失
|
||
await page.wait_for_selector(".loading", state="hidden")
|
||
|
||
# 等待导航完成
|
||
async with page.expect_navigation():
|
||
await page.click(".next-page")
|
||
|
||
# 等待网络空闲
|
||
await page.wait_for_load_state("networkidle")
|
||
```
|
||
|
||
### 6. 执行 JavaScript
|
||
|
||
```python
|
||
# 简单表达式
|
||
result = await page.evaluate("document.title")
|
||
|
||
# 带参数(此处用 f-string 拼接;若选择器含引号会破坏 JS,建议改为把参数作为 evaluate 的第二个实参传入)
|
||
selector = ".job-item"
|
||
count = await page.evaluate(f"document.querySelectorAll('{selector}').length")
|
||
|
||
# 复杂逻辑
|
||
result = await page.evaluate("""
|
||
() => {
|
||
const items = document.querySelectorAll('.job-item');
|
||
return Array.from(items).map(el => el.outerHTML);
|
||
}
|
||
""")
|
||
|
||
# 在元素上执行
|
||
element = await page.query_selector(".job-item")
|
||
html = await element.evaluate("el => el.outerHTML")
|
||
```
|
||
|
||
### 7. 截图
|
||
|
||
```python
|
||
# 当前视口截图(默认只截取可见区域)
|
||
await page.screenshot(path="screenshot.png")
|
||
|
||
# 元素截图
|
||
element = await page.query_selector(".job-list")
|
||
await element.screenshot(path="element.png")
|
||
|
||
# 全页面(包括滚动区域)
|
||
await page.screenshot(path="full.png", full_page=True)
|
||
```
|
||
|
||
## 常用模式
|
||
|
||
### 模式1:获取多个元素的 HTML
|
||
|
||
```python
|
||
# 方法1:evaluate
|
||
htmls = await page.evaluate("""
|
||
selector => {
|
||
const items = document.querySelectorAll(selector);
|
||
return Array.from(items).slice(0, 3).map(el => el.outerHTML);
|
||
}
|
||
""", ".job-item")
|
||
|
||
# 方法2:遍历 ElementHandle
|
||
elements = await page.query_selector_all(".job-item")
|
||
htmls = []
|
||
for el in elements[:3]:
|
||
html = await el.evaluate("el => el.outerHTML")
|
||
htmls.append(html)
|
||
```
|
||
|
||
### 模式2:检测页面变化
|
||
|
||
```python
|
||
before_url = page.url
|
||
|
||
await page.click(".job-item")
|
||
await asyncio.sleep(2)
|
||
|
||
after_url = page.url
|
||
|
||
if after_url != before_url:
|
||
print("发生了跳转")
|
||
```
|
||
|
||
### 模式3:带异常兜底的选择器验证
|
||
|
||
```python
|
||
async def validate_selector(page, selector: str) -> int:
|
||
"""验证选择器,返回匹配数量"""
|
||
try:
|
||
elements = await page.query_selector_all(selector)
|
||
return len(elements)
|
||
except Exception:
|
||
return 0
|
||
```
|
||
|
||
### 模式4:安全获取元素内容
|
||
|
||
```python
|
||
async def safe_inner_html(page, selector: str) -> str | None:
|
||
"""安全获取元素 innerHTML"""
|
||
element = await page.query_selector(selector)
|
||
if element:
|
||
return await element.inner_html()
|
||
return None
|
||
```
|
||
|
||
## 浏览器配置
|
||
|
||
```python
|
||
# 有头模式(可见)
|
||
browser = await p.chromium.launch(headless=False)
|
||
|
||
# 无头模式(后台运行)
|
||
browser = await p.chromium.launch(headless=True)
|
||
|
||
# 慢动作(调试用)
|
||
browser = await p.chromium.launch(headless=False, slow_mo=500) # 每步慢 500ms
|
||
|
||
# 设置窗口大小
|
||
context = await browser.new_context(viewport={"width": 1280, "height": 720})
|
||
page = await context.new_page()
|
||
|
||
# 设置 User-Agent
|
||
context = await browser.new_context(
|
||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||
)
|
||
```
|
||
|
||
## 错误处理
|
||
|
||
```python
|
||
from playwright.async_api import TimeoutError as PlaywrightTimeout
|
||
|
||
try:
|
||
await page.click(".not-exist", timeout=3000)
|
||
except PlaywrightTimeout:
|
||
print("元素未找到")
|
||
except Exception as e:
|
||
print(f"其他错误: {e}")
|
||
```
|
||
|
||
## 项目中的典型用法
|
||
|
||
```python
|
||
from playwright.async_api import async_playwright
|
||
|
||
async def analyze_page(url: str):
|
||
async with async_playwright() as p:
|
||
browser = await p.chromium.launch(headless=False)
|
||
page = await browser.new_page()
|
||
|
||
try:
|
||
# 1. 访问页面
|
||
await page.goto(url, wait_until="networkidle")
|
||
|
||
# 2. 获取 HTML
|
||
html = await page.inner_html("body")
|
||
|
||
# 3. 验证选择器
|
||
selector = ".job-item"
|
||
elements = await page.query_selector_all(selector)
|
||
print(f"匹配到 {len(elements)} 个元素")
|
||
|
||
# 4. 点击验证
|
||
if elements:
|
||
before_url = page.url
|
||
await elements[0].click()
|
||
await asyncio.sleep(2)  # 注意:需要在文件顶部 import asyncio
|
||
after_url = page.url
|
||
print(f"跳转: {before_url} -> {after_url}")
|
||
|
||
finally:
|
||
await browser.close()
|
||
```
|
||
|
||
## 与 MCP 对比
|
||
|
||
| 操作 | MCP 方式 | 直接 Playwright |
|
||
|------|----------|-----------------|
|
||
| 导航 | `await call_tool(tools, "browser_navigate", url=url)` | `await page.goto(url)` |
|
||
| 获取 HTML | `await call_tool(tools, "browser_evaluate", function="document.body.innerHTML")` | `await page.inner_html("body")` |
|
||
| 点击 | `await call_tool(tools, "browser_evaluate", function="document.querySelector('.x').click()")` | `await page.click(".x")` |
|
||
| 获取 URL | `await call_tool(tools, "browser_evaluate", function="window.location.href")` | `page.url` |
|
||
|
||
直接用 Playwright 代码更简洁、更 Pythonic。
|