Python简单爬虫示例

44 阅读1分钟

引入依赖

import csv

import requests
from lxml import etree
from lxml import html

设置变量，循环生成待爬取的 URL 列表

# Accumulator for scraped rows: [title, replies/views, author, time].
dataList = []

url = 'https://bbs.hupu.com/lol-hot'
# Page 1 is the bare URL; pages 2-10 take a '-<n>' suffix.
urls = [url] + ['{}-{}'.format(url, n) for n in range(2, 11)]

定义方法爬取数据

def reqfunc(url, timeout=10):
    """Fetch one Hupu forum page and append its post rows to dataList.

    Each appended row is [title, replies/views, author, post time],
    with every field flattened to a plain string ('' when missing).

    Args:
        url: Page URL to fetch.
        timeout: Seconds before the HTTP request is aborted. New,
            backward-compatible parameter — previously the request
            had no timeout and could hang indefinitely.
    """
    res = requests.get(url, timeout=timeout)
    res.encoding = 'UTF8'

    # Parse the HTML string into an lxml Element tree for XPath queries.
    data = html.fromstring(res.text)

    # Each post occupies one layout <div> on the listing page.
    item = data.xpath('.//div[@class="bbs-sl-web-post-layout"]')

    # Walk the posts and extract each field relative to its <div>.
    for i in item:
        title = i.xpath('./div[@class="post-title"]/a/text()')
        reply_browse = i.xpath('./div[@class="post-datum"]/text()')
        author = i.xpath('./div[@class="post-auth"]/a/text()')
        time = i.xpath('./div[@class="post-time"]/text()')
        # xpath() returns a list of text nodes; ''.join collapses it to
        # a single string (empty when the element was not found).
        dataList.append([''.join(title), ''.join(reply_browse),
                         ''.join(author), ''.join(time)])

执行方法爬取数据

# Fetch every listing page; scraped rows accumulate in dataList.
for page_url in urls:
    reqfunc(page_url)

保存为CSV文件

# Persist the scraped rows as UTF-8 CSV (requires `import csv` at file top;
# the original script used csv.writer without ever importing the module).
with open('lol24.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Single header row, then one row per scraped post.
    writer.writerow(['标题', '回复/浏览', '作者', '时间'])
    writer.writerows(dataList)