generated from kgod/ai-review-template
7.2 KiB
7.2 KiB
Playwright Python 使用指南
本文档覆盖项目中会用到的 Playwright 核心功能。
安装
pip install playwright
playwright install chromium # 安装浏览器
基本结构
from playwright.async_api import async_playwright
async def main():
    """Launch Chromium, open one page, and always close the browser."""
    async with async_playwright() as p:
        # Launch the browser; headless=False makes the browser window visible.
        browser = await p.chromium.launch(headless=False)
        try:
            # Create a page.
            page = await browser.new_page()
            # Interact with the page here...
        finally:
            # Close the browser even if a page operation raised.
            await browser.close()
核心操作
1. 导航
# Navigate to a URL
await page.goto("https://example.com")
# Choose the event goto() waits for (optional strategies)
await page.goto("https://example.com", wait_until="networkidle") # network is idle
await page.goto("https://example.com", wait_until="domcontentloaded") # DOM content loaded
# Read the current URL
current_url = page.url
2. 获取内容
# Get the HTML of the whole page
html = await page.content()
# Get the inner HTML of <body>
body_html = await page.inner_html("body")
# Get the inner HTML of an element
area_html = await page.inner_html(".job-list")
# Get the outer HTML of an element (including its own tag)
outer = await page.evaluate("document.querySelector('.job-item').outerHTML")
# Get the text content
text = await page.inner_text(".title")
# Get the page title
title = await page.title()
3. 选择器与元素定位
# 单个元素
element = await page.query_selector(".job-item") # 返回 ElementHandle 或 None
# 多个元素
elements = await page.query_selector_all(".job-item") # 返回列表
count = len(elements)
# 检查元素是否存在
if await page.query_selector(".job-item"):
print("存在")
# Locator API(推荐,更稳定)
locator = page.locator(".job-item")
count = await locator.count()
first = locator.first
nth = locator.nth(2) # 第3个元素
4. 点击与交互
# Click an element
await page.click(".job-item")
# Click the first matching element
await page.locator(".job-item").first.click()
# Click the N-th matching element (nth() is zero-based, so 0 is the first match)
await page.locator(".job-item").nth(0).click()
# Click with a timeout
await page.click(".job-item", timeout=5000) # wait at most 5 seconds
# Type text into an input
await page.fill("input[name='search']", "关键词")
# Press a key
await page.keyboard.press("Enter")
5. 等待
import asyncio
# 简单等待(秒)
await asyncio.sleep(2)
# 等待选择器出现
await page.wait_for_selector(".job-list", timeout=10000)
# 等待选择器消失
await page.wait_for_selector(".loading", state="hidden")
# 等待导航完成
async with page.expect_navigation():
await page.click(".next-page")
# 等待网络空闲
await page.wait_for_load_state("networkidle")
6. 执行 JavaScript
# 简单表达式
result = await page.evaluate("document.title")
# 带参数
selector = ".job-item"
count = await page.evaluate(f"document.querySelectorAll('{selector}').length")
# 复杂逻辑
result = await page.evaluate("""
() => {
const items = document.querySelectorAll('.job-item');
return Array.from(items).map(el => el.outerHTML);
}
""")
# 在元素上执行
element = await page.query_selector(".job-item")
html = await element.evaluate("el => el.outerHTML")
7. 截图
# Screenshot of the current viewport (full_page is not set here)
await page.screenshot(path="screenshot.png")
# Screenshot of a single element
element = await page.query_selector(".job-list")
await element.screenshot(path="element.png")
# Full-page screenshot (including the scrollable area)
await page.screenshot(path="full.png", full_page=True)
常用模式
模式1:获取多个元素的 HTML
# 方法1:evaluate
htmls = await page.evaluate("""
selector => {
const items = document.querySelectorAll(selector);
return Array.from(items).slice(0, 3).map(el => el.outerHTML);
}
""", ".job-item")
# 方法2:遍历 ElementHandle
elements = await page.query_selector_all(".job-item")
htmls = []
for el in elements[:3]:
html = await el.evaluate("el => el.outerHTML")
htmls.append(html)
模式2:检测页面变化
before_url = page.url
await page.click(".job-item")
await asyncio.sleep(2)
after_url = page.url
if after_url != before_url:
print("发生了跳转")
模式3:带重试的选择器验证
async def validate_selector(
    page, selector: str, *, attempts: int = 3, delay: float = 0.5
) -> int:
    """Count the elements matching ``selector``, retrying when the query raises.

    Args:
        page: Object exposing an awaitable ``query_selector_all(selector)``.
        selector: Selector to validate.
        attempts: Maximum number of queries to issue; values below 1 are
            treated as 1.
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        The number of elements returned by the first query that succeeds,
        or 0 if every attempt raised.
    """
    import asyncio  # local import keeps this snippet self-contained

    attempts = max(1, attempts)
    for attempt in range(1, attempts + 1):
        try:
            elements = await page.query_selector_all(selector)
        except Exception:
            # Best-effort validation: a failing query counts as "no match",
            # but only after the remaining attempts have been used up.
            if attempt < attempts:
                await asyncio.sleep(delay)
        else:
            return len(elements)
    return 0
模式4:安全获取元素内容
async def safe_inner_html(page, selector: str) -> str | None:
"""安全获取元素 innerHTML"""
element = await page.query_selector(selector)
if element:
return await element.inner_html()
return None
浏览器配置
# Headed mode (browser window visible)
browser = await p.chromium.launch(headless=False)
# Headless mode (runs in the background)
browser = await p.chromium.launch(headless=True)
# Slow motion (for debugging)
browser = await p.chromium.launch(headless=False, slow_mo=500) # slows each step by 500 ms
# Set the viewport size
context = await browser.new_context(viewport={"width": 1280, "height": 720})
page = await context.new_page()
# Set the User-Agent
context = await browser.new_context(
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
错误处理
from playwright.async_api import TimeoutError as PlaywrightTimeout
try:
await page.click(".not-exist", timeout=3000)
except PlaywrightTimeout:
print("元素未找到")
except Exception as e:
print(f"其他错误: {e}")
项目中的典型用法
from playwright.async_api import async_playwright
async def analyze_page(url: str):
    """Open ``url`` in Chromium, count ``.job-item`` matches, and click the first.

    Prints the number of matched elements and, when at least one matched,
    the URL before and after clicking the first match. The browser is
    closed even when a step raises.

    Args:
        url: Address of the page to analyse.
    """
    # The snippet only imports async_playwright, so bring asyncio into scope
    # here for the fixed pause after the click.
    import asyncio

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            # 1. Open the page
            await page.goto(url, wait_until="networkidle")
            # 2. Grab the body HTML (illustrative; not used further here)
            html = await page.inner_html("body")
            # 3. Validate the selector
            selector = ".job-item"
            elements = await page.query_selector_all(selector)
            print(f"匹配到 {len(elements)} 个元素")
            # 4. Click the first match and see whether the URL changed
            if elements:
                before_url = page.url
                await elements[0].click()
                await asyncio.sleep(2)
                after_url = page.url
                print(f"跳转: {before_url} -> {after_url}")
        finally:
            await browser.close()
与 MCP 对比
| 操作 | MCP 方式 | 直接 Playwright |
|---|---|---|
| 导航 | `await call_tool(tools, "browser_navigate", url=url)` | `await page.goto(url)` |
| 获取 HTML | `await call_tool(tools, "browser_evaluate", function="document.body.innerHTML")` | `await page.inner_html("body")` |
| 点击 | `await call_tool(tools, "browser_evaluate", function="document.querySelector('.x').click()")` | `await page.click(".x")` |
| 获取 URL | `await call_tool(tools, "browser_evaluate", function="window.location.href")` | `page.url` |
直接用 Playwright 代码更简洁、更 Pythonic。