前言: 在数据为王的时代, 人人都想拿到海量数据, 再通过层层包装转化成付费产品, 数据的重要性不言而喻。下面就尝试用前端技术爬取专利数据。
看效果:

上代码:
from DrissionPage import ChromiumPage
import time
import json
import asyncio
# Launch a Chromium browser session controlled via DrissionPage.
page = ChromiumPage()
# Open the CNIPA (China National IP Administration) advanced-search page.
page.get('http://epub.cnipa.gov.cn/Advanced')
page.wait.load_start()
# Fill in the date range 2023-05-10 .. 2024-05-10.
# NOTE(review): '#ad_begin'/'#ad_end' look like the application-date fields — confirm against the live page.
page.ele('#ad_begin').input('2023.05.10')
page.ele('#ad_end').input('2024.05.10')
# Click the search (magnifier) button and wait for results to load.
page.ele('.icon-sea').click()
page.wait.load_start()
# Open the page-size dropdown and pick 10 results per page.
page.ele('#sizeSelect').click()
page.ele('@value=10').click()
page.wait.load_start()
async def collect_titles_from_pages(pagecur, num_pages=100):
    """Scrape patent records from up to ``num_pages`` result pages.

    Each result card on the page has a ``.title`` element plus a set of
    label elements ('申请公布号:', '申请人:', ...) whose *next sibling*
    holds the actual value.

    Parameters:
        pagecur: a DrissionPage ChromiumPage already showing the first
            results page.
        num_pages: maximum number of result pages to walk (default 100).

    Returns:
        list[dict]: one dict per patent with keys title, sub_gbh, sub_gur,
        sub_sqh, sub_sqd, sub_sqr, sub_fmr, sub_dz, sub_zy.
    """
    records = []
    for _ in range(num_pages):
        pagecur.wait.load_start()
        # Label elements; the displayed value is each label's next sibling.
        sub_gbhs = pagecur.eles('申请公布号:')
        sub_gurs = pagecur.eles('申请公布日:')
        sub_sqhs = pagecur.eles('申请号:')
        sub_sqds = pagecur.eles('申请日:')
        sub_sqrs = pagecur.eles('申请人:')
        sub_fmrs = pagecur.eles('发明人:')
        sub_dzs = pagecur.eles('地址:')
        sub_zys = pagecur.eles('摘要:')
        titles = pagecur.eles('.title')
        await asyncio.sleep(2)  # give lazily rendered content time to appear
        # zip (instead of indexing by range(len(titles))) stops at the
        # shortest list, so a partially rendered card cannot raise IndexError.
        for title, gbh, gur, sqh, sqd, sqr, fmr, dz, zy in zip(
                titles, sub_gbhs, sub_gurs, sub_sqhs, sub_sqds,
                sub_sqrs, sub_fmrs, sub_dzs, sub_zys):
            records.append({
                "title": title.text,
                "sub_gbh": gbh.next().text,
                "sub_gur": gur.next().text,
                "sub_sqh": sqh.next().text,
                "sub_sqd": sqd.next().text,
                "sub_sqr": sqr.next().text,
                "sub_fmr": fmr.next().text,
                "sub_dz": dz.next().text,
                "sub_zy": zy.next().text,
            })
        # Advance to the next page; stop cleanly when there is none.
        # (The original ele_click flag was dead code: it was always 1 here.)
        try:
            next_page_element = pagecur.ele('.next_page')
            if not next_page_element:
                break
            next_page_element.click()
        except Exception as e:
            # Best-effort: log and stop paging rather than crash mid-scrape.
            print(f"Error clicking next page: {e}")
            break
    return records
# Run the async scraper over the first two result pages, echo the data,
# then persist it as pretty-printed UTF-8 JSON.
my_list = asyncio.run(collect_titles_from_pages(page, 2))
print(my_list)
serialized = json.dumps(my_list, ensure_ascii=False, indent=4)
with open("patent.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized)
爬数据:

结语: 谁说前端只能当切图仔? 只要你愿意学习, 没有什么能难倒你。