Files
post_crawler/docs/playwright_guide.md
T
2026-05-26 21:02:17 +08:00

7.2 KiB
Raw Blame History

Playwright Python 使用指南

本文档覆盖项目中会用到的 Playwright 核心功能。

安装

pip install playwright
playwright install chromium  # 安装浏览器

基本结构

from playwright.async_api import async_playwright

async def main():
    """Minimal Playwright session: launch Chromium, open a page, close the browser."""
    async with async_playwright() as playwright:
        # headless=False keeps the browser window visible while the script runs.
        chromium = playwright.chromium
        browser = await chromium.launch(headless=False)

        # Open a new page in the launched browser.
        page = await browser.new_page()

        # ... interact with `page` here ...

        # Shut the browser down once the work is done.
        await browser.close()

核心操作

1. 导航

# 访问 URL
await page.goto("https://example.com")

# 等待加载完成(可选策略)
await page.goto("https://example.com", wait_until="networkidle")  # 网络空闲
await page.goto("https://example.com", wait_until="domcontentloaded")  # DOM 加载完成

# 获取当前 URL
current_url = page.url

2. 获取内容

# 获取整个页面 HTML
html = await page.content()

# 获取 body 内部 HTML
body_html = await page.inner_html("body")

# 获取元素内部 HTML
area_html = await page.inner_html(".job-list")

# 获取元素外部 HTML(包含自身标签)
outer = await page.evaluate("document.querySelector('.job-item').outerHTML")

# 获取文本内容
text = await page.inner_text(".title")

# 获取页面标题
title = await page.title()

3. 选择器与元素定位

# 单个元素
element = await page.query_selector(".job-item")  # 返回 ElementHandle 或 None

# 多个元素
elements = await page.query_selector_all(".job-item")  # 返回列表
count = len(elements)

# 检查元素是否存在
if await page.query_selector(".job-item"):
    print("存在")

# Locator API(推荐,更稳定)
locator = page.locator(".job-item")
count = await locator.count()
first = locator.first
nth = locator.nth(2)  # 第3个元素

4. 点击与交互

# 点击元素
await page.click(".job-item")

# 点击第一个匹配的元素
await page.locator(".job-item").first.click()

# 点击第 N 个元素
await page.locator(".job-item").nth(0).click()

# 带等待的点击
await page.click(".job-item", timeout=5000)  # 最多等 5 秒

# 输入文本
await page.fill("input[name='search']", "关键词")

# 按键
await page.keyboard.press("Enter")

5. 等待

import asyncio

# 简单等待(秒)
await asyncio.sleep(2)

# 等待选择器出现
await page.wait_for_selector(".job-list", timeout=10000)

# 等待选择器消失
await page.wait_for_selector(".loading", state="hidden")

# 等待导航完成
async with page.expect_navigation():
    await page.click(".next-page")

# 等待网络空闲
await page.wait_for_load_state("networkidle")

6. 执行 JavaScript

# 简单表达式
result = await page.evaluate("document.title")

# 带参数
selector = ".job-item"
count = await page.evaluate(f"document.querySelectorAll('{selector}').length")

# 复杂逻辑
result = await page.evaluate("""
    () => {
        const items = document.querySelectorAll('.job-item');
        return Array.from(items).map(el => el.outerHTML);
    }
""")

# 在元素上执行
element = await page.query_selector(".job-item")
html = await element.evaluate("el => el.outerHTML")

7. 截图

# Page screenshot without full_page: captures the visible viewport only
# (see the full_page=True call below for the whole scrollable page)
await page.screenshot(path="screenshot.png")

# Screenshot of a single element
element = await page.query_selector(".job-list")
await element.screenshot(path="element.png")

# Full-page screenshot (including the scrollable area)
await page.screenshot(path="full.png", full_page=True)

常用模式

模式1:获取多个元素的 HTML

# Method 1: evaluate — one call that collects the outerHTML of the first
# three matches inside the page; the selector is passed in as an argument
htmls = await page.evaluate("""
    selector => {
        const items = document.querySelectorAll(selector);
        return Array.from(items).slice(0, 3).map(el => el.outerHTML);
    }
""", ".job-item")

# Method 2: iterate over the ElementHandles (first three only)
elements = await page.query_selector_all(".job-item")
htmls = []
for el in elements[:3]:
    html = await el.evaluate("el => el.outerHTML")
    htmls.append(html)

模式2:检测页面变化

before_url = page.url

await page.click(".job-item")
await asyncio.sleep(2)

after_url = page.url

if after_url != before_url:
    print("发生了跳转")

模式3:带重试的选择器验证

async def validate_selector(
    page, selector: str, retries: int = 2, delay: float = 0.5
) -> int:
    """Count the elements matching ``selector``, retrying when the query fails.

    A successful query returns immediately, exactly as before; only a query
    that raises is retried.

    Args:
        page: Object exposing an awaitable ``query_selector_all(selector)``
            (e.g. a Playwright ``Page``).
        selector: Selector to validate.
        retries: Extra attempts made after a failed query; negative values
            are treated as 0.
        delay: Seconds to wait between attempts.

    Returns:
        The number of matched elements, or 0 if every attempt raised.
    """
    import asyncio  # local import keeps this snippet self-contained

    attempts = max(retries, 0) + 1
    for attempt in range(attempts):
        try:
            elements = await page.query_selector_all(selector)
        except Exception:
            # Best-effort by design: a failing query counts as "no match",
            # but only once the remaining attempts are used up.
            if attempt + 1 < attempts:
                await asyncio.sleep(delay)
        else:
            return len(elements)
    return 0

模式4:安全获取元素内容

async def safe_inner_html(page, selector: str) -> str | None:
    """安全获取元素 innerHTML"""
    element = await page.query_selector(selector)
    if element:
        return await element.inner_html()
    return None

浏览器配置

# 有头模式(可见)
browser = await p.chromium.launch(headless=False)

# 无头模式(后台运行)
browser = await p.chromium.launch(headless=True)

# 慢动作(调试用)
browser = await p.chromium.launch(headless=False, slow_mo=500)  # 每步慢 500ms

# 设置窗口大小
context = await browser.new_context(viewport={"width": 1280, "height": 720})
page = await context.new_page()

# 设置 User-Agent
context = await browser.new_context(
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)

错误处理

from playwright.async_api import TimeoutError as PlaywrightTimeout

try:
    await page.click(".not-exist", timeout=3000)
except PlaywrightTimeout:
    print("元素未找到")
except Exception as e:
    print(f"其他错误: {e}")

项目中的典型用法

from playwright.async_api import async_playwright

async def analyze_page(url: str):
    """Open ``url`` in Chromium, count ``.job-item`` matches and probe a click.

    Prints the number of matched elements and, when at least one matched,
    the page URL before and after clicking the first one.

    Args:
        url: Address of the page to analyze.
    """
    # The snippet's import line only brings in async_playwright, so asyncio
    # (needed for the sleep below) is imported here to keep it runnable.
    import asyncio

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        try:
            # 1. Load the page and wait for the network to go idle
            await page.goto(url, wait_until="networkidle")

            # 2. Fetch the HTML inside <body> (shown for illustration; not
            #    used further in this example)
            html = await page.inner_html("body")

            # 3. Validate the selector
            selector = ".job-item"
            elements = await page.query_selector_all(selector)
            print(f"匹配到 {len(elements)} 个元素")

            # 4. Click the first match and compare the URL before and after
            if elements:
                before_url = page.url
                await elements[0].click()
                await asyncio.sleep(2)
                after_url = page.url
                print(f"跳转: {before_url} -> {after_url}")

        finally:
            # Always close the browser, even if a step above raised.
            await browser.close()

与 MCP 对比

| 操作 | MCP 方式 | 直接 Playwright |
| --- | --- | --- |
| 导航 | `await call_tool(tools, "browser_navigate", url=url)` | `await page.goto(url)` |
| 获取 HTML | `await call_tool(tools, "browser_evaluate", function="document.body.innerHTML")` | `await page.inner_html("body")` |
| 点击 | `await call_tool(tools, "browser_evaluate", function="document.querySelector('.x').click()")` | `await page.click(".x")` |
| 获取 URL | `await call_tool(tools, "browser_evaluate", function="window.location.href")` | `page.url` |

直接用 Playwright 代码更简洁、更 Pythonic。