最近在学习 CSV 与 JSON 两种写入格式的不同,数据来源于某个直播网站。此文章仅用于研究学习,请勿用作商业用途;本作者对使用的后果不承担责任,侵删。
from selenium import webdriver
import time
import csv
class DouyuSpider(object):
    """Scrape the room cards on Douyu's "all" directory page and save them as CSV.

    The five ``*_list`` attributes are parallel lists: index ``i`` of each one
    holds a field of the same room card.
    """

    def __init__(self):
        # Listing page that gets opened in the browser.
        self.http_url = "https://www.douyu.com/directory/all"
        # Scraped information.
        self.douyu_data = []  # not populated by any method of this class
        self.titile_list = []  # room titles (misspelled name kept for compatibility)
        self.tag_list = []  # category tags
        self.name_list = []  # streamer names
        self.hot_list = []  # popularity text
        self.href_list = []  # room URLs

    def parse_http_url(self):
        """Open the listing page in Chrome and collect the fields of every room card.

        The values are appended to the instance's lists, so calling this method
        more than once accumulates results. The browser is closed even when an
        element lookup raises.

        Returns:
            tuple: ``(titile_list, tag_list, name_list, hot_list, href_list)``,
            the instance's own list objects.
        """
        # Get the browser object.
        driver = webdriver.Chrome()
        try:
            # Visit the page and maximize the window.
            driver.get(self.http_url)
            driver.maximize_window()
            # Fixed pause to let the page load.
            time.sleep(1)
            # Scroll the page, then pause again before reading the cards
            # (presumably so more content gets rendered -- TODO confirm).
            driver.execute_script('window.scrollTo(300,1000)')
            time.sleep(2)
            # One <li> per room card. The generic find_element(s)(by, value)
            # form is used because the find_*_by_xpath helpers were removed in
            # Selenium 4; "xpath" is the locator-strategy string.
            li_list = driver.find_elements(
                "xpath",
                '//div[@class="layout-Module-container layout-Cover ListContent"]/ul[@class="layout-Cover-list"]/li')
            for li in li_list:
                # Read all five fields before appending any of them, so a
                # failed lookup cannot leave the parallel lists misaligned.
                title = li.find_element("xpath", './/h3[@class="DyListCover-intro"]').text
                category_tag = li.find_element("xpath", './/span[@class="DyListCover-zone"]').text
                name = li.find_element("xpath", './/div[@class="DyListCover-userName"]').text
                popularity = li.find_element("xpath", './/span[@class="DyListCover-hot"]').text
                href = li.find_element("xpath", './/a[@class="DyListCover-wrap"]').get_attribute("href")

                self.titile_list.append(title)
                self.tag_list.append(category_tag)
                self.name_list.append(name)
                self.hot_list.append(popularity)
                self.href_list.append(href)
                print(f"{name}的信息已经爬取完毕...")
        finally:
            # Always shut the browser down, including on errors.
            driver.quit()
        return self.titile_list, self.tag_list, self.name_list, self.hot_list, self.href_list

    def sava_csvdata_by_douyu(self, title, category_tag, name, popularity, href,
                              filename="斗鱼直播信息2-16.csv"):
        """Write the scraped columns to a UTF-8 CSV file, overwriting any existing file.

        Args:
            title: Iterable of room titles.
            category_tag: Iterable of category tags.
            name: Iterable of streamer names.
            popularity: Iterable of popularity values.
            href: Iterable of room URLs.
            filename: Output path; defaults to the original hard-coded name.

        The columns are combined with ``zip``, so output stops at the shortest one.
        """
        rows = zip(title, category_tag, name, popularity, href)
        with open(filename, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            # Emit the header through the csv writer so it gets the same line
            # terminator (and quoting rules) as the data rows.
            writer.writerow(["标题", "分类", "主播名", "热度", "直播间地址"])
            writer.writerows(rows)
        print("数据保存成功")

    def run(self):
        """Scrape the listing page, then save the collected lists to CSV."""
        # parse_http_url fills the instance lists in place; they are passed on below.
        self.parse_http_url()
        self.sava_csvdata_by_douyu(self.titile_list, self.tag_list, self.name_list, self.hot_list, self.href_list)
if __name__ == '__main__':
    # Script entry point: build the spider and run its scrape-then-save pipeline.
    spider = DouyuSpider()
    spider.run()
```
```