爬虫 - 豆瓣网电影排名

153 阅读2分钟

说明

代码可以直接跑,根据情况调整爬虫的headers

分析

第一页url(此处原为截图,图片缺失) 第二页url(此处原为截图,图片缺失)

结论

通过上面两张图片可以发现 第1页url==>......com/top250?start=0&filter= 第2页url==>......com/top250?start=25&filter=

由此类推 第n页url==>......com/top250?start=(页数-1)×25&filter=

爬虫代码

# -*- coding: utf-8 -*-

import csv
import os

import requests
from lxml import etree  # 使用xpath解析


class DoubanSpider:
    """Crawl the Douban Top 250 movie chart and export (title, score, link) rows to CSV."""

    # Entry point: crawl the first `pages` listing pages and save them.
    def start(self, csv_file, pages=5):
        """Crawl `pages` result pages (default 5, i.e. the first 125 movies)
        and write all rows to `csv_file`.

        The default keeps the original behavior; callers may pass a larger
        `pages` to crawl more of the chart.
        """
        page_datas = []
        for i in range(pages):
            # Each page shows 25 entries; page n starts at offset (n-1)*25.
            page_url = 'https://movie.douban.com/top250?start={}&filter='.format(i * 25)
            page_datas += self.crawl_page(page_url)
        self.to_csv(csv_file, page_datas)

    # Send the request and return the response.
    def parse_url(self, url):
        """GET `url` and return the `requests.Response`.

        A browser User-Agent is sent because Douban rejects the default
        `requests` UA. Raises `requests.HTTPError` on a non-2xx status and
        `requests.Timeout` if the server does not answer within 10 seconds.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50',
        }
        # Fix: without a timeout requests.get() can hang forever on a stalled
        # connection; also fail fast on HTTP errors instead of parsing an
        # error page as if it were the chart.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response

    # Crawl a single listing page.
    def crawl_page(self, url):
        """Scrape one Top-250 listing page; return a list of
        (title, score, link) tuples, one per movie entry."""
        response = self.parse_url(url)
        tree = etree.HTML(response.text)
        page_data = []
        for li in tree.xpath('//ol/li'):
            # Fix: the original indexed [0] unconditionally and crashed with
            # IndexError on any entry missing a field; skip such entries.
            title = li.xpath('.//div[@class="hd"]/a/span[1]/text()')
            link = li.xpath('.//div[@class="hd"]/a/@href')
            score = li.xpath('.//div[@class="star"]/span[2]/text()')
            if not (title and link and score):
                continue
            info = (title[0], score[0], link[0])
            print('抓到了数据:{}'.format(info))
            page_data.append(info)
        return page_data

    # Save the collected rows as a CSV file.
    def to_csv(self, csv_file, datas: list):
        """Write `datas` (iterable of row tuples) to `csv_file` with a
        Chinese header row.

        Fix: the file is opened with an explicit `utf-8-sig` encoding.
        The original relied on the locale default, which can raise
        UnicodeEncodeError for the Chinese header on some systems; the BOM
        written by `utf-8-sig` also lets Excel detect the encoding.
        """
        with open(csv_file, 'w', newline='', encoding='utf-8-sig') as f:
            w = csv.writer(f)
            w.writerow(['电影名称', '评分', '链接'])  # header row
            w.writerows(datas)
        print('成功下载到:{}'.format(csv_file))


if __name__ == '__main__':
    # To save somewhere else, hard-code a path instead, e.g.:
    # csv_file = r'D:\豆瓣排名.csv'

    # Default: write the CSV onto the current user's Desktop.
    csv_file = os.path.join(os.path.expanduser("~"), 'Desktop', '豆瓣排名.csv')

    DoubanSpider().start(csv_file)

输出展示

(此处原为运行输出的截图,图片缺失)

csv文件展示

(此处原为csv文件内容的截图,图片缺失)