python爬虫之斗鱼信息获取(csv格式写入数据)

166 阅读1分钟

最近在学习csv格式和json写入格式的不同,数据来源于某个直播网站。此文章只用于研究学习,请勿用作商业用途;本作者对使用的后果不承担责任,侵删。

import csv
import time

from selenium import webdriver
from selenium.webdriver.common.by import By


class DouyuSpider(object):
    """Scrape the live-room listing from Douyu's "all rooms" page and save it to CSV.

    Drives a real Chrome browser through Selenium, so a matching chromedriver
    must be available on PATH. Scraped fields are accumulated in parallel
    lists and zipped together when written out.
    """

    def __init__(self):
        # Landing page that lists all live rooms.
        self.http_url = "https://www.douyu.com/directory/all"
        # Parallel lists of scraped fields; indices line up across all five.
        # NOTE(review): "titile" is a historical typo, kept so any external
        # code reading this attribute keeps working.
        self.douyu_data = []
        self.titile_list = []
        self.tag_list = []
        self.name_list = []
        self.hot_list = []
        self.href_list = []

    def parse_http_url(self):
        """Open the listing page, scrape one screen of room cards, return the lists.

        Returns:
            tuple: (titles, category tags, streamer names, popularity, room URLs)
        """
        # Launch the browser.
        driver = webdriver.Chrome()
        try:
            driver.get(self.http_url)
            driver.maximize_window()
            # Give the page a moment to render its initial content.
            time.sleep(1)
            # Scroll so lazily-loaded cards come into view.
            driver.execute_script('window.scrollTo(300,1000)')
            time.sleep(2)
            # find_elements(By.XPATH, ...): the find_elements_by_* helpers
            # were removed in Selenium 4.
            li_list = driver.find_elements(
                By.XPATH,
                '//div[@class="layout-Module-container layout-Cover ListContent"]/ul[@class="layout-Cover-list"]/li')

            for li in li_list:
                # Room title
                title = li.find_element(By.XPATH, './/h3[@class="DyListCover-intro"]').text
                self.titile_list.append(title)
                # Category tag
                category_tag = li.find_element(By.XPATH, './/span[@class="DyListCover-zone"]').text
                self.tag_list.append(category_tag)
                # Streamer name
                name = li.find_element(By.XPATH, './/div[@class="DyListCover-userName"]').text
                self.name_list.append(name)
                # Popularity ("hot") counter
                popularity = li.find_element(By.XPATH, './/span[@class="DyListCover-hot"]').text
                self.hot_list.append(popularity)
                # Live-room URL
                href = li.find_element(By.XPATH, './/a[@class="DyListCover-wrap"]').get_attribute("href")
                self.href_list.append(href)
                print(f"{name}的信息已经爬取完毕...")
        finally:
            # Always release the browser, even if a locator raises mid-scrape
            # (the original leaked the Chrome process on any exception).
            driver.quit()
        return self.titile_list, self.tag_list, self.name_list, self.hot_list, self.href_list

    def sava_csvdata_by_douyu(self, title, category_tag, name, popularity, href):
        """Write the scraped columns to a CSV file, one row per live room.

        Args:
            title, category_tag, name, popularity, href: parallel sequences;
                zip() drops extra elements from any longer sequence.
        """
        rows = zip(title, category_tag, name, popularity, href)
        # newline="" is required so csv.writer controls line endings itself.
        with open("斗鱼直播信息2-16.csv", "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            # Header goes through the writer too: the original used f.write,
            # which bypassed CSV quoting and mixed "\n" with the writer's "\r\n".
            writer.writerow(["标题", "分类", "主播名", "热度", "直播间地址"])
            writer.writerows(rows)
        print("数据保存成功")

    def run(self):
        """Scrape the listing page, then persist the results to CSV."""
        # Return value is also stored on self, so no need to capture it here
        # (the original bound it to an unused "json_data").
        self.parse_http_url()
        self.sava_csvdata_by_douyu(
            self.titile_list, self.tag_list, self.name_list,
            self.hot_list, self.href_list)


# Script entry point: scrape one page of rooms and dump them to CSV.
if __name__ == '__main__':
    DouyuSpider().run()