# Import dependencies
import csv

import requests
from lxml import etree
from lxml import html
# Build the list of page URLs to scrape: page 1 is the bare board URL,
# pages 2-10 append "-<page>" (e.g. .../lol-hot-2).
dataList = []  # accumulates one [title, reply/browse, author, time] row per post
url = 'https://bbs.hupu.com/lol-hot'
urls = [url if num == 1 else f'{url}-{num}' for num in range(1, 11)]
# Define the scraping function for a single listing page.
def reqfunc(url):
    """Fetch *url*, parse the Hupu post list, and extend dataList in place.

    Each appended row is [title, reply/browse count, author, post time].
    A field missing from the page becomes '' because ''.join([]) == ''.
    """
    # timeout so one dead/slow server cannot hang the whole crawl
    res = requests.get(url, timeout=10)
    res.encoding = 'UTF8'
    doc = html.fromstring(res.text)
    posts = doc.xpath('.//div[@class="bbs-sl-web-post-layout"]')
    for post in posts:
        title = post.xpath('./div[@class="post-title"]/a/text()')
        reply_browse = post.xpath('./div[@class="post-datum"]/text()')
        author = post.xpath('./div[@class="post-auth"]/a/text()')
        post_time = post.xpath('./div[@class="post-time"]/text()')  # renamed: avoid shadowing stdlib `time`
        dataList.append([''.join(title), ''.join(reply_browse),
                         ''.join(author), ''.join(post_time)])
# Execute the crawl: reqfunc appends rows to dataList as a side effect.
for page_url in urls:
    reqfunc(page_url)
# Save the scraped rows as a CSV file (UTF-8, with a Chinese header row).
# newline='' is required by the csv module to avoid blank lines on Windows.
with open('lol24.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['标题', '回复/浏览', '作者', '时间'])  # single header row
    writer.writerows(dataList)