Files
post_crawler/docs/playwright_guide.md
T
2026-05-26 21:02:17 +08:00

308 lines
7.2 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Playwright Python 使用指南
本文档覆盖项目中会用到的 Playwright 核心功能。
## 安装
```bash
pip install playwright
playwright install chromium # 安装浏览器
```
## 基本结构
```python
from playwright.async_api import async_playwright
async def main():
    """Launch Chromium, open a single page, and always close the browser."""
    async with async_playwright() as p:
        # Start the browser; headless=False keeps the browser window visible
        browser = await p.chromium.launch(headless=False)
        try:
            # Create a page
            page = await browser.new_page()
            # Work with the page...
        finally:
            # Close the browser even if a page operation above raised
            await browser.close()
```
## 核心操作
### 1. 导航
```python
# 访问 URL
await page.goto("https://example.com")
# 等待加载完成(可选策略)
await page.goto("https://example.com", wait_until="networkidle") # 网络空闲
await page.goto("https://example.com", wait_until="domcontentloaded") # DOM 加载完成
# 获取当前 URL
current_url = page.url
```
### 2. 获取内容
```python
# 获取整个页面 HTML
html = await page.content()
# 获取 body 内部 HTML
body_html = await page.inner_html("body")
# 获取元素内部 HTML
area_html = await page.inner_html(".job-list")
# 获取元素外部 HTML(包含自身标签)
outer = await page.evaluate("document.querySelector('.job-item').outerHTML")
# 获取文本内容
text = await page.inner_text(".title")
# 获取页面标题
title = await page.title()
```
### 3. 选择器与元素定位
```python
# 单个元素
element = await page.query_selector(".job-item") # 返回 ElementHandle 或 None
# 多个元素
elements = await page.query_selector_all(".job-item") # 返回列表
count = len(elements)
# 检查元素是否存在
if await page.query_selector(".job-item"):
print("存在")
# Locator API(推荐,更稳定)
locator = page.locator(".job-item")
count = await locator.count()
first = locator.first
nth = locator.nth(2) # 第3个元素
```
### 4. 点击与交互
```python
# 点击元素
await page.click(".job-item")
# 点击第一个匹配的元素
await page.locator(".job-item").first.click()
# 点击第 N 个元素(索引从 0 开始,nth(0) 即第 1 个)
await page.locator(".job-item").nth(0).click()
# 带等待的点击
await page.click(".job-item", timeout=5000) # 最多等 5 秒
# 输入文本
await page.fill("input[name='search']", "关键词")
# 按键
await page.keyboard.press("Enter")
```
### 5. 等待
```python
import asyncio
# 简单等待(秒)
await asyncio.sleep(2)
# 等待选择器出现
await page.wait_for_selector(".job-list", timeout=10000)
# 等待选择器消失
await page.wait_for_selector(".loading", state="hidden")
# 等待导航完成(注意:expect_navigation 存在竞态,官方文档建议优先使用 page.wait_for_url())
async with page.expect_navigation():
await page.click(".next-page")
# 等待网络空闲
await page.wait_for_load_state("networkidle")
```
### 6. 执行 JavaScript
```python
# 简单表达式
result = await page.evaluate("document.title")
# 带参数
selector = ".job-item"
count = await page.evaluate(f"document.querySelectorAll('{selector}').length")
# 复杂逻辑
result = await page.evaluate("""
() => {
const items = document.querySelectorAll('.job-item');
return Array.from(items).map(el => el.outerHTML);
}
""")
# 在元素上执行
element = await page.query_selector(".job-item")
html = await element.evaluate("el => el.outerHTML")
```
### 7. 截图
```python
# 当前视口截图(默认只截取可见区域)
await page.screenshot(path="screenshot.png")
# 元素截图
element = await page.query_selector(".job-list")
await element.screenshot(path="element.png")
# 全页面(包括滚动区域)
await page.screenshot(path="full.png", full_page=True)
```
## 常用模式
### 模式1:获取多个元素的 HTML
```python
# 方法1:evaluate
htmls = await page.evaluate("""
selector => {
const items = document.querySelectorAll(selector);
return Array.from(items).slice(0, 3).map(el => el.outerHTML);
}
""", ".job-item")
# 方法2:遍历 ElementHandle
elements = await page.query_selector_all(".job-item")
htmls = []
for el in elements[:3]:
html = await el.evaluate("el => el.outerHTML")
htmls.append(html)
```
### 模式2:检测页面变化
```python
before_url = page.url
await page.click(".job-item")
await asyncio.sleep(2)
after_url = page.url
if after_url != before_url:
print("发生了跳转")
```
### 模式3:带重试的选择器验证
```python
async def validate_selector(
    page, selector: str, *, retries: int = 3, delay: float = 0.5
) -> int:
    """Count the elements matching ``selector``, retrying while none are found.

    Args:
        page: Object exposing an awaitable ``query_selector_all(selector)``.
        selector: Selector to validate.
        retries: Maximum number of attempts; values below 1 are treated as 1.
        delay: Seconds to wait between attempts; no wait after the last one.

    Returns:
        The number of matched elements from the first attempt that finds any,
        or 0 when every attempt finds nothing or raises.
    """
    import asyncio  # local import keeps this snippet self-contained

    attempts = max(1, retries)
    for attempt in range(attempts):
        try:
            matched = len(await page.query_selector_all(selector))
        except Exception:
            # Deliberate best effort: a failed query counts as "no match yet".
            matched = 0
        if matched:
            return matched
        if delay > 0 and attempt < attempts - 1:
            await asyncio.sleep(delay)
    return 0
```
### 模式4:安全获取元素内容
```python
async def safe_inner_html(page, selector: str) -> str | None:
"""安全获取元素 innerHTML"""
element = await page.query_selector(selector)
if element:
return await element.inner_html()
return None
```
## 浏览器配置
```python
# 有头模式(可见)
browser = await p.chromium.launch(headless=False)
# 无头模式(后台运行)
browser = await p.chromium.launch(headless=True)
# 慢动作(调试用)
browser = await p.chromium.launch(headless=False, slow_mo=500) # 每步慢 500ms
# 设置窗口大小
context = await browser.new_context(viewport={"width": 1280, "height": 720})
page = await context.new_page()
# 设置 User-Agent
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
```
## 错误处理
```python
from playwright.async_api import TimeoutError as PlaywrightTimeout
try:
await page.click(".not-exist", timeout=3000)
except PlaywrightTimeout:
print("元素未找到")
except Exception as e:
print(f"其他错误: {e}")
```
## 项目中的典型用法
```python
from playwright.async_api import async_playwright
async def analyze_page(url: str):
    """Open ``url``, count ``.job-item`` matches, and click the first one.

    Prints the number of matched elements and, when at least one exists,
    the page URL before and after clicking the first match. The browser is
    closed in all cases.

    Args:
        url: Address of the page to analyze.
    """
    # The original snippet called asyncio.sleep without importing asyncio
    import asyncio

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()
        try:
            # 1. Open the page
            await page.goto(url, wait_until="networkidle")
            # 2. Fetch the body HTML (kept to illustrate the step; unused below)
            html = await page.inner_html("body")
            # 3. Validate the selector
            selector = ".job-item"
            elements = await page.query_selector_all(selector)
            print(f"匹配到 {len(elements)} 个元素")
            # 4. Click check: compare the URL before and after the click
            if elements:
                before_url = page.url
                await elements[0].click()
                await asyncio.sleep(2)
                after_url = page.url
                print(f"跳转: {before_url} -> {after_url}")
        finally:
            await browser.close()
```
## 与 MCP 对比
| 操作 | MCP 方式 | 直接 Playwright |
|------|----------|-----------------|
| 导航 | `await call_tool(tools, "browser_navigate", url=url)` | `await page.goto(url)` |
| 获取 HTML | `await call_tool(tools, "browser_evaluate", function="document.body.innerHTML")` | `await page.inner_html("body")` |
| 点击 | `await call_tool(tools, "browser_evaluate", function="document.querySelector('.x').click()")` | `await page.click(".x")` |
| 获取 URL | `await call_tool(tools, "browser_evaluate", function="window.location.href")` | `page.url` |
直接用 Playwright 代码更简洁、更 Pythonic。