# Playwright Python 使用指南

本文档覆盖项目中会用到的 Playwright 核心功能。

## 安装

```bash
pip install playwright
playwright install chromium  # 安装浏览器
```

## 基本结构

```python
import asyncio

from playwright.async_api import async_playwright


async def main():
    async with async_playwright() as p:
        # 启动浏览器
        browser = await p.chromium.launch(headless=False)  # False 可看到浏览器界面

        # 创建页面
        page = await browser.new_page()

        # 操作页面...

        # 关闭
        await browser.close()


asyncio.run(main())
```

## 核心操作

### 1. 导航

```python
# 访问 URL
await page.goto("https://example.com")

# 等待加载完成(可选策略)
await page.goto("https://example.com", wait_until="networkidle")  # 网络空闲
await page.goto("https://example.com", wait_until="domcontentloaded")  # DOM 加载完成

# 获取当前 URL
current_url = page.url
```

### 2. 获取内容

```python
# 获取整个页面 HTML
html = await page.content()

# 获取 body 内部 HTML
body_html = await page.inner_html("body")

# 获取元素内部 HTML
area_html = await page.inner_html(".job-list")

# 获取元素外部 HTML(包含自身标签)
outer = await page.evaluate("document.querySelector('.job-item').outerHTML")

# 获取文本内容
text = await page.inner_text(".title")

# 获取页面标题
title = await page.title()
```

### 3. 选择器与元素定位

```python
# 单个元素
element = await page.query_selector(".job-item")  # 返回 ElementHandle 或 None

# 多个元素
elements = await page.query_selector_all(".job-item")  # 返回列表
count = len(elements)

# 检查元素是否存在
if await page.query_selector(".job-item"):
    print("存在")

# Locator API(推荐,更稳定)
locator = page.locator(".job-item")
count = await locator.count()
first = locator.first
nth = locator.nth(2)  # 第3个元素
```

### 4. 点击与交互

```python
# 点击元素
await page.click(".job-item")

# 点击第一个匹配的元素
await page.locator(".job-item").first.click()

# 点击第 N 个元素
await page.locator(".job-item").nth(0).click()

# 带等待的点击
await page.click(".job-item", timeout=5000)  # 最多等 5 秒

# 输入文本
await page.fill("input[name='search']", "关键词")

# 按键
await page.keyboard.press("Enter")
```

### 5. 等待

```python
import asyncio

# 简单等待(秒)
await asyncio.sleep(2)

# 等待选择器出现
await page.wait_for_selector(".job-list", timeout=10000)

# 等待选择器消失
await page.wait_for_selector(".loading", state="hidden")

# 等待导航完成
async with page.expect_navigation():
    await page.click(".next-page")

# 等待网络空闲
await page.wait_for_load_state("networkidle")
```

### 6. 执行 JavaScript

```python
# 简单表达式
result = await page.evaluate("document.title")

# 带参数
selector = ".job-item"
count = await page.evaluate(f"document.querySelectorAll('{selector}').length")

# 复杂逻辑
result = await page.evaluate("""
    () => {
        const items = document.querySelectorAll('.job-item');
        return Array.from(items).map(el => el.outerHTML);
    }
""")

# 在元素上执行
element = await page.query_selector(".job-item")
html = await element.evaluate("el => el.outerHTML")
```

### 7. 截图

```python
# 当前视口截图(默认)
await page.screenshot(path="screenshot.png")

# 元素截图
element = await page.query_selector(".job-list")
await element.screenshot(path="element.png")

# 全页面(包括滚动区域)
await page.screenshot(path="full.png", full_page=True)
```

## 常用模式

### 模式1:获取多个元素的 HTML

```python
# 方法1:evaluate
htmls = await page.evaluate("""
    selector => {
        const items = document.querySelectorAll(selector);
        return Array.from(items).slice(0, 3).map(el => el.outerHTML);
    }
""", ".job-item")

# 方法2:遍历 ElementHandle
elements = await page.query_selector_all(".job-item")
htmls = []
for el in elements[:3]:
    html = await el.evaluate("el => el.outerHTML")
    htmls.append(html)
```

### 模式2:检测页面变化

```python
before_url = page.url
await page.click(".job-item")
await asyncio.sleep(2)
after_url = page.url

if after_url != before_url:
    print("发生了跳转")
```

### 模式3:带异常处理的选择器验证

```python
async def validate_selector(page, selector: str) -> int:
    """验证选择器,返回匹配数量(出错时返回 0)"""
    try:
        elements = await page.query_selector_all(selector)
        return len(elements)
    except Exception:
        return 0
```

### 模式4:安全获取元素内容

```python
async def safe_inner_html(page, selector: str) -> str | None:
    """安全获取元素 innerHTML"""
    element = await page.query_selector(selector)
    if element:
        return await element.inner_html()
    return None
```

## 浏览器配置

```python
# 有头模式(可见)
browser = await p.chromium.launch(headless=False)

# 无头模式(后台运行)
browser = await p.chromium.launch(headless=True)

# 慢动作(调试用)
browser = await p.chromium.launch(headless=False, slow_mo=500)  # 每步慢 500ms

# 设置窗口大小
context = await browser.new_context(viewport={"width": 1280, "height": 720})
page = await context.new_page()

# 设置 User-Agent
context = await browser.new_context(
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
)
```

## 错误处理

```python
from playwright.async_api import TimeoutError as PlaywrightTimeout

try:
    await page.click(".not-exist", timeout=3000)
except PlaywrightTimeout:
    print("元素未找到")
except Exception as e:
    print(f"其他错误: {e}")
```

## 项目中的典型用法

```python
import asyncio

from playwright.async_api import async_playwright


async def analyze_page(url: str):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        try:
            # 1. 访问页面
            await page.goto(url, wait_until="networkidle")

            # 2. 获取 HTML
            html = await page.inner_html("body")

            # 3. 验证选择器
            selector = ".job-item"
            elements = await page.query_selector_all(selector)
            print(f"匹配到 {len(elements)} 个元素")

            # 4. 点击验证
            if elements:
                before_url = page.url
                await elements[0].click()
                await asyncio.sleep(2)
                after_url = page.url
                print(f"跳转: {before_url} -> {after_url}")
        finally:
            await browser.close()
```

## 与 MCP 对比

| 操作 | MCP 方式 | 直接 Playwright |
|------|----------|-----------------|
| 导航 | `await call_tool(tools, "browser_navigate", url=url)` | `await page.goto(url)` |
| 获取 HTML | `await call_tool(tools, "browser_evaluate", function="document.body.innerHTML")` | `await page.inner_html("body")` |
| 点击 | `await call_tool(tools, "browser_evaluate", function="document.querySelector('.x').click()")` | `await page.click(".x")` |
| 获取 URL | `await call_tool(tools, "browser_evaluate", function="window.location.href")` | `page.url` |

直接用 Playwright 代码更简洁、更 Pythonic。