1. Introduction
Playwright is a powerful browser automation framework. It supports cross-platform automation of the mainstream browsers (Chrome, Firefox, Edge, etc.) and is similar in functionality to Selenium. It can simulate fine-grained user actions as well as extract page content precisely, so it is widely used for data scraping, automated testing, and search-engine simulation.
Building on Playwright, this article constructs a general-purpose browser engine interface that covers common search engine operations (Google, Baidu, Bing) and web page parsing. By abstracting and encapsulating these features, developers can easily run web searches and extract data, simplifying the everyday web interaction and data processing work.
Playwright Python docs: playwright.dev/python/docs…
2. Installation and Usage
```bash
pip install playwright
```
Install the browser binaries:

```bash
playwright install
```

Basic usage:
```python
import asyncio

from playwright.async_api import async_playwright


async def main():
    # Start the Playwright engine
    playwright_engine = await async_playwright().start()
    # Launch the browser; headless=False means headed mode (visible window)
    browser = await playwright_engine.chromium.launch(headless=False)
    # Open a new page
    page = await browser.new_page()
    # Visit the Juejin homepage
    await page.goto("https://juejin.cn/")
    await asyncio.sleep(2)


if __name__ == "__main__":
    asyncio.run(main())
```
With `headless=False`, Playwright launches the browser binaries it installed and shows a visible window while running.
Note: in GUI-less environments such as Linux servers, set `headless` to `True`.
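If you want to sanity-check a GUI-less host, here is a minimal sketch using the context-manager form from the official docs. The `--no-sandbox` flag is an assumption for containerized/root environments (a common Chromium requirement there); drop it if your environment doesn't need it.

```python
import asyncio

from playwright.async_api import async_playwright


async def main():
    # The context-manager form stops Playwright automatically on exit
    async with async_playwright() as p:
        # headless=True for servers; --no-sandbox is often required when
        # Chromium runs as root in a container (assumption -- drop otherwise)
        browser = await p.chromium.launch(headless=True, args=["--no-sandbox"])
        page = await browser.new_page()
        await page.goto("https://juejin.cn/")
        print(await page.title())  # print the page title as a smoke test
        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
```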
3. Feature Encapsulation
Wrapping common search engines
One of the most common things a browser does is search the web. To simulate this, we can automate the search by visiting a search engine's entry URL. This tool wraps the common engines (Google, Baidu, Bing) and retrieves the corresponding results by simulating a user's query and search.
Google search
```python
import asyncio
from pprint import pprint

from playwright.async_api import async_playwright

from src.tools.schemas import LinkInfo, WebPage


class BrowserEngine:
    def __init__(self, headless=True, timeout=10, **launch_kwargs):
        self.headless = headless
        self.playwright_engine = None
        self.browser = None
        self.launch_kwargs = launch_kwargs
        self.timeout = timeout  # unit: seconds

    async def launch_browser(self):
        if self.browser is None:
            self.playwright_engine = await async_playwright().start()
            self.browser = await self.playwright_engine.chromium.launch(
                headless=self.headless,
                timeout=self.timeout * 1000,  # unit: ms
                **self.launch_kwargs,
            )

    async def close_browser(self):
        if self.browser:
            await self.browser.close()
            await self.playwright_engine.stop()
            self.browser = None

    async def google_search(self, query, max_results=8) -> list[LinkInfo]:
        await self.launch_browser()
        page = await self.browser.new_page()
        async with page:
            # Open the Google results page directly
            await page.goto(f"https://www.google.com/search?q={query}")
            # Alternative: type the query and press Enter
            # await page.fill("textarea[name='q']", query)
            # await page.press("textarea[name='q']", "Enter")
            # Wait for the result containers to load
            el_selector = "div.MjjYud"
            await page.wait_for_selector(el_selector)
            # Collect the result elements (title + link containers)
            results = await page.query_selector_all(el_selector)
            search_results = await self.get_search_results(results, max_results, engine_type="google")
            return search_results

    async def _parse_google_results(self, results, max_results=8) -> list[LinkInfo]:
        search_results = []
        for result in results:
            try:
                # Extract the title and link
                title_element = await result.query_selector("h3")
                if not title_element:
                    continue  # Skip results without a title
                title = await title_element.text_content()
                url = await title_element.evaluate("el => el.parentElement.href")
                # Extract the snippet
                snapshot_element = await result.query_selector("div.VwiC3b")
                snapshot = await snapshot_element.text_content() if snapshot_element else ""
                if len(search_results) >= max_results:
                    break
                search_results.append(LinkInfo(title=title, url=url, snapshot=snapshot))
            except Exception as e:
                print(e)
        return search_results

    async def get_search_results(self, results, max_results=8, engine_type="google"):
        parse_method_mapping = {
            "google": self._parse_google_results,
            # "bing": self._parse_bing_results,
            # "baidu": self._parse_baidu_results,
        }
        if engine_type not in parse_method_mapping:
            raise ValueError(f"Engine type {engine_type} is not supported")
        parse_method = parse_method_mapping[engine_type]
        return await parse_method(results, max_results)


async def main():
    engine = BrowserEngine(headless=True)
    results = await engine.google_search("重试装饰器", max_results=5)
    pprint(results)
    await engine.close_browser()


if __name__ == "__main__":
    asyncio.run(main())
```
Running the script pretty-prints the search results. The encapsulation approach is as follows:
- Launching and closing the browser: `launch_browser` starts the browser instance, and `close_browser` closes it and cleans up resources.
- The Google search method: `google_search` visits Google's search entry point and simulates a search. By navigating to `https://www.google.com/search?q={query}`, the keyword is passed directly in the URL, avoiding typing into the search box and pressing keys.
- Parsing search results: `get_search_results` picks the parsing method that matches the engine. For Google, `_parse_google_results` extracts each result's title, URL, and snippet and returns them as `LinkInfo` objects.
- Flexible extension: the `engine_type` parameter leaves room to support other engines (such as Bing or Baidu) later.
At first I simulated the search by typing into the search box and pressing keys, but each engine's entry elements are different, which made the code complicated. Switching to a plain GET URL made things much cleaner. The `_parse_google_results` method is worth a closer look.
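One caveat of building the URL with an f-string: the query is interpolated as-is, which breaks for keywords containing `&`, `#`, or spaces. A minimal sketch of URL-encoding the query first (`build_search_url` is a hypothetical helper, not part of the original tool):

```python
from urllib.parse import quote_plus

# Hypothetical helper: the original code interpolates the raw query string
SEARCH_URL_TEMPLATES = {
    "google": "https://www.google.com/search?q={q}",
    "bing": "https://www.bing.com/search?q={q}",
    "baidu": "https://www.baidu.com/s?wd={q}",
}


def build_search_url(engine_type: str, query: str) -> str:
    # quote_plus percent-encodes reserved characters and turns spaces into '+'
    return SEARCH_URL_TEMPLATES[engine_type].format(q=quote_plus(query))


print(build_search_url("google", "python asyncio & playwright"))
# -> https://www.google.com/search?q=python+asyncio+%26+playwright
```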
```python
async def _parse_google_results(self, results, max_results=8) -> list[LinkInfo]:
    search_results = []
    for result in results:
        try:
            # Extract the title and link
            title_element = await result.query_selector("h3")
            if not title_element:
                continue  # Skip results without a title
            title = await title_element.text_content()
            url = await title_element.evaluate("el => el.parentElement.href")
            # Extract the snippet
            snapshot_element = await result.query_selector("div.VwiC3b")
            snapshot = await snapshot_element.text_content() if snapshot_element else ""
            if len(search_results) >= max_results:
                break
            search_results.append(LinkInfo(title=title, url=url, snapshot=snapshot))
        except Exception as e:
            print(e)
    return search_results
```
The `results` parameter holds the HTML elements filtered out of the results page with a front-end element selector. Take `div.MjjYud` as an example: it matches every `div` element carrying the `MjjYud` class.

```python
el_selector = "div.MjjYud"
results = await page.query_selector_all(el_selector)
```
You can open the browser's developer tools to inspect and analyze the page.
This way we can quickly locate the result containers and go on to parse out their titles, links, and snippets, wrapping each one into a `LinkInfo` data object:
```python
from pydantic import BaseModel


class LinkInfo(BaseModel):
    title: str
    url: str
    snapshot: str
```
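For illustration, a quick usage sketch (assuming pydantic v2, where serialization uses `model_dump`; on v1 use `.dict()` instead; the field values here are made up):

```python
info = LinkInfo(
    title="Retry decorators in Python",  # hypothetical result
    url="https://example.com/retry",
    snapshot="A short excerpt from the results page",
)
print(info.model_dump())
# {'title': 'Retry decorators in Python', 'url': 'https://example.com/retry',
#  'snapshot': 'A short excerpt from the results page'}
```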
The Baidu and Bing searches below follow the same logic; only the page structures differ, so the elements to locate are different. That is why I used Google as the worked example; for the rest I'll just paste the code without repeating the explanation.
Baidu search
```python
import asyncio

from playwright.async_api import async_playwright

from src.tools.schemas import LinkInfo, WebPage


class BrowserEngine:
    def __init__(self, headless=True, timeout=10, **launch_kwargs):
        self.headless = headless
        self.playwright_engine = None
        self.browser = None
        self.launch_kwargs = launch_kwargs
        self.timeout = timeout  # unit: seconds

    async def launch_browser(self):
        if self.browser is None:
            self.playwright_engine = await async_playwright().start()
            self.browser = await self.playwright_engine.chromium.launch(
                headless=self.headless,
                timeout=self.timeout * 1000,  # unit: ms
                **self.launch_kwargs,
            )

    async def close_browser(self):
        if self.browser:
            await self.browser.close()
            await self.playwright_engine.stop()
            self.browser = None

    async def baidu_search(self, query, max_results=8) -> list[LinkInfo]:
        await self.launch_browser()
        page = await self.browser.new_page()
        async with page:
            # Open the Baidu results page directly
            await page.goto(f"https://www.baidu.com/s?wd={query}")
            # Wait for the result containers to load
            el_selector = "div.c-container"
            await page.wait_for_selector(el_selector)
            # Collect the result elements (title + link containers)
            results = await page.query_selector_all(el_selector)
            search_results = await self.get_search_results(results, max_results, engine_type="baidu")
            return search_results

    async def _parse_baidu_results(self, results, max_results=8) -> list[LinkInfo]:
        search_results = []
        for result in results:
            try:
                # Extract the title and link
                title_element = await result.query_selector("h3 a")
                if not title_element:
                    continue  # Skip results without a title
                title = await title_element.text_content()
                url = await title_element.get_attribute("href")
                # Try the snippet layout that includes a cover image
                snapshot_element = await result.query_selector("div.c-span9 span.content-right_2s-H4")
                snapshot = await snapshot_element.text_content() if snapshot_element else ""
                # Fall back to the snippet layout without a cover image
                if not snapshot:
                    snapshot_element = await result.query_selector("span.content-right_1THTn")
                    snapshot = await snapshot_element.text_content() if snapshot_element else ""
                if len(search_results) >= max_results:
                    break
                search_results.append(LinkInfo(title=title, url=url, snapshot=snapshot))
            except Exception as e:
                print(e)
        return search_results

    async def get_search_results(self, results, max_results=8, engine_type="google"):
        parse_method_mapping = {
            # "google": self._parse_google_results,
            # "bing": self._parse_bing_results,
            "baidu": self._parse_baidu_results,
        }
        if engine_type not in parse_method_mapping:
            raise ValueError(f"Engine type {engine_type} is not supported")
        parse_method = parse_method_mapping[engine_type]
        return await parse_method(results, max_results)


async def main():
    engine = BrowserEngine(headless=True)
    await engine.baidu_search("python异步框架大战", max_results=5)
    await engine.close_browser()


if __name__ == "__main__":
    asyncio.run(main())
```
Bing search
```python
import asyncio

from playwright.async_api import async_playwright

from src.tools.schemas import LinkInfo, WebPage


class BrowserEngine:
    def __init__(self, headless=True, timeout=10, **launch_kwargs):
        self.headless = headless
        self.playwright_engine = None
        self.browser = None
        self.launch_kwargs = launch_kwargs
        self.timeout = timeout  # unit: seconds

    async def launch_browser(self):
        if self.browser is None:
            self.playwright_engine = await async_playwright().start()
            self.browser = await self.playwright_engine.chromium.launch(
                headless=self.headless,
                timeout=self.timeout * 1000,  # unit: ms
                **self.launch_kwargs,
            )

    async def close_browser(self):
        if self.browser:
            await self.browser.close()
            await self.playwright_engine.stop()
            self.browser = None

    async def bing_search(self, query, max_results=8) -> list[LinkInfo]:
        await self.launch_browser()
        page = await self.browser.new_page()
        async with page:
            # Open the Bing results page directly
            await page.goto(f"https://www.bing.com/search?q={query}")
            # Wait until the result links have loaded
            el_selector = "li.b_algo"
            await page.wait_for_selector(el_selector)
            # Collect the result elements (title + link containers)
            results = await page.query_selector_all(el_selector)
            search_results = await self.get_search_results(results, max_results, engine_type="bing")
            return search_results

    async def _parse_bing_results(self, results, max_results=8) -> list[LinkInfo]:
        search_results = []
        for result in results:
            try:
                # Extract the title and link
                title_element = await result.query_selector("h2 a")
                if not title_element:
                    continue  # Skip results without a title
                title = await title_element.text_content()
                url = await title_element.get_attribute("href")
                # Extract the snippet
                snapshot_element = await result.query_selector("div.b_caption")
                snapshot = await snapshot_element.text_content() if snapshot_element else ""
                if len(search_results) >= max_results:
                    break
                search_results.append(LinkInfo(title=title, url=url, snapshot=snapshot))
            except Exception as e:
                print(e)
        return search_results

    async def get_search_results(self, results, max_results=8, engine_type="google"):
        parse_method_mapping = {
            # "google": self._parse_google_results,
            "bing": self._parse_bing_results,
            # "baidu": self._parse_baidu_results,
        }
        if engine_type not in parse_method_mapping:
            raise ValueError(f"Engine type {engine_type} is not supported")
        parse_method = parse_method_mapping[engine_type]
        return await parse_method(results, max_results)


async def main():
    engine = BrowserEngine(headless=True)
    await engine.bing_search("重试装饰器", max_results=5)
    await engine.close_browser()


if __name__ == "__main__":
    asyncio.run(main())
```
Note: this kind of browser-driven search is inherently unstable, because the extracted data depends on the rendered page structure. Once the structure changes, the data can no longer be parsed correctly, and the elements have to be re-located and the code updated. This is only a simulated search wrapper.
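To soften that fragility a little, one option is to try a list of candidate selectors instead of one hard-coded selector per engine. A minimal sketch (the alternative `div.g` entry for Google is an illustrative assumption, not verified against the current markup):

```python
# Hypothetical fallback table: candidate selectors tried in order
CANDIDATE_SELECTORS = {
    "google": ["div.MjjYud", "div.g"],  # second entry is an unverified guess
    "bing": ["li.b_algo"],
    "baidu": ["div.c-container"],
}


async def query_results_with_fallback(page, engine_type: str, timeout_ms=5000):
    for selector in CANDIDATE_SELECTORS[engine_type]:
        try:
            # Wait for this candidate to appear; move on if it times out
            await page.wait_for_selector(selector, timeout=timeout_ms)
            return await page.query_selector_all(selector)
        except Exception:
            continue
    return []  # no candidate matched -- the page layout probably changed
```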
Fetching page content
```python
import asyncio

from playwright.async_api import async_playwright

from src.tools.schemas import LinkInfo, WebPage


class BrowserEngine:
    def __init__(self, headless=True, timeout=10, **launch_kwargs):
        self.headless = headless
        self.playwright_engine = None
        self.browser = None
        self.launch_kwargs = launch_kwargs
        self.timeout = timeout  # unit: seconds

    async def launch_browser(self):
        if self.browser is None:
            self.playwright_engine = await async_playwright().start()
            self.browser = await self.playwright_engine.chromium.launch(
                headless=self.headless,
                timeout=self.timeout * 1000,  # unit: ms
                **self.launch_kwargs,
            )

    async def close_browser(self):
        if self.browser:
            await self.browser.close()
            await self.playwright_engine.stop()
            self.browser = None

    def get_selector(self, content: str) -> str:
        selector = "body"
        if "juejin" in content and "article-area" in content:
            # Juejin article page
            selector = "div#juejin > div.view-container > main > div > div.main-area.article-area"
        # todo: Zhihu, CSDN, cnblogs ...
        return selector

    async def _parse_page_content(self, page, url, selector: str = "body", timeout=None) -> WebPage:
        timeout = timeout or self.timeout
        timeout = timeout * 1000  # unit: ms
        async with page:
            await page.goto(url)
            content = await page.content()
            if selector == "body":
                # Derive a content-specific selector from the HTML so we only
                # extract the relevant part of the page, with less noise
                selector = self.get_selector(content)
            try:
                inner_text = await page.inner_text(selector, timeout=timeout)
            except Exception as e:
                print(e)
                inner_text = await page.inner_text("body")
            return WebPage(url=url, content=content, inner_text=inner_text)

    async def _fetch_page_content(self, url, selector="body", timeout=None) -> WebPage:
        try:
            await self.launch_browser()
            page = await self.browser.new_page()
            return await self._parse_page_content(page, url, selector, timeout)
        except Exception as e:
            print(e)
            return WebPage(url="", content="", inner_text="")

    async def fetch_page_content(self, urls: list, selector="body", timeout=None) -> list[WebPage]:
        return await asyncio.gather(*[self._fetch_page_content(url, selector, timeout) for url in urls])
```
The design behind each method:

- `get_selector(self, content: str) -> str`
  - Purpose: returns an appropriate selector for the given page content. The method decides dynamically which selector to use so that only the relevant content is extracted; it relies on a set of rules built in up front.
  - Approach: detect distinguishing features of the page (for example the `article-area` class specific to Juejin pages) to pick the best selector and make extraction more accurate. Only Juejin articles are supported for now; other platforms such as Zhihu and CSDN can be added later (see the rule-table sketch after this list).
- `_parse_page_content(self, page, url, selector: str = "body", timeout=None) -> WebPage`
  - Purpose: parses a page. Given a URL, it fetches the complete HTML and extracts the part of the page matched by the selector.
  - Approach: first navigate to the URL and grab the content; if the selector is the default `body`, call `get_selector` to derive a content-specific one. Then use that selector to extract the page's `inner_text` (its main visible text). If extraction fails, the method falls back to the `body` selector, so some content is always returned.
- `_fetch_page_content(self, url, selector="body", timeout=None) -> WebPage`
  - Purpose: fetches the content of a single URL by calling `_parse_page_content`.
  - Approach: a higher-level wrapper around `_parse_page_content` for scraping a single URL. It returns a `WebPage` object containing the `url`, the `content` (page HTML), and the `inner_text` (the page's embedded text).
- `fetch_page_content(self, urls: list, selector="body", timeout=None) -> list[WebPage]`
  - Purpose: fetches multiple URLs concurrently and returns the results (see the bounded-concurrency sketch below).
  - Approach: `asyncio.gather` runs the per-URL `_fetch_page_content` tasks concurrently, improving throughput when scraping several pages at once.
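As the number of supported sites grows, the chain of `if` checks in `get_selector` can be replaced with a rule table. A minimal sketch of that extension (the commented non-Juejin entry is a made-up placeholder, not a verified selector):

```python
# Hypothetical rule table: map required HTML marker substrings to a selector.
# Only the Juejin rule comes from the original code; the commented entry just
# shows the shape of an extension and is not verified.
SELECTOR_RULES = [
    (("juejin", "article-area"),
     "div#juejin > div.view-container > main > div > div.main-area.article-area"),
    # (("zhihu", "Post-RichTextContainer"), "div.Post-RichTextContainer"),
]


def get_selector(content: str) -> str:
    for markers, selector in SELECTOR_RULES:
        if all(marker in content for marker in markers):
            return selector
    return "body"  # fallback: extract the whole page
```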
The `WebPage` data object looks like this:
```python
from pydantic import BaseModel


class WebPage(BaseModel):
    url: str
    content: str
    inner_text: str

    def __repr__(self):
        return f"WebPage(url={self.url}, content={self.content[:20]}, inner_text={self.inner_text[:20]})"
```
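One caveat with plain `asyncio.gather`: every URL opens its own page at the same time, which can get heavy for long URL lists. A bounded-concurrency variant using `asyncio.Semaphore` (a sketch layered on top of the class, not part of the original code; `max_concurrency` is an assumed knob):

```python
import asyncio


async def fetch_many(engine, urls, max_concurrency=5, timeout=None):
    # Cap how many pages are open simultaneously
    semaphore = asyncio.Semaphore(max_concurrency)

    async def fetch_one(url):
        async with semaphore:
            return await engine._fetch_page_content(url, timeout=timeout)

    # Tasks still run concurrently, but at most `max_concurrency` at a time
    return await asyncio.gather(*[fetch_one(url) for url in urls])
```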
Testing the result
```python
async def main():
    engine = BrowserEngine(headless=True)
    results = await engine.google_search("重试装饰器", max_results=3)
    # pprint(results)
    # urls = ["https://juejin.cn/post/7283532551473725497"]
    urls = [result.url for result in results]
    web_pages = await engine.fetch_page_content(urls, timeout=1)
    pprint(web_pages)
    await engine.close_browser()


if __name__ == "__main__":
    asyncio.run(main())
```
4. Summary

The `playwright` tool wrapper mainly implements:

- Common search engines (such as Google): retrieving each result's title, detail link, and snippet.
- Page content fetching: given a link, parsing out the page's full embedded text and HTML, with concurrent handling of multiple links.

In the future, these two capabilities can be combined with large language models to build smarter features such as web search and page summarization, enabling an LLM to fetch information in real time and generate accurate summaries.
5. Source Code
AGI-Demo: practice projects for AGI techniques
GitHub: github.com/HuiDBK/AGI-…