『Python』如何使用 Pandas 获取“豆瓣电影”相关数据,并生成Excel表格

1,075 阅读2分钟

使用Python获取“豆瓣电影”相关数据,并生成Excel表格

本文使用了requestspandas第三方库,对豆瓣电影各相关数据进行爬取,并记录至Excel表格内。但发现存在以下三个问题:

  1. 生成的Excel表格,第一个sheet页为空
  2. 数据爬取数据过长
  3. 无法获取每个标签的具体电影数量,所以我取了一个吉祥的数字520,但请各位大佬指教,若为空,则直接停止,进入下一个标签的爬取

请各位大佬进行指教,并进行优化,小弟在此谢谢! 注:要是觉得文章写得不错,记得留个赞哦! 多余的sheet页 因为如果一开始不生成一个sheet页,则无法使用ExcelWriter的方法,小弟的解决方法是使用remove方法进行移除多余的sheet页,代码如下,请各位大佬指教!

# 移除创建表格时多余的sheet页
def del_excel(self):
	book = load_workbook(self.fileName)
	ws = book["Sheet1"]
	book.remove(ws)
	book.save(self.fileName)

爬取豆瓣电影相关数据的具体代码如下:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests as re
from openpyxl import load_workbook
import pandas as pd
import json


class DouBan:
    def __init__(self):

        self.sr1 = pd.DataFrame(columns=["评分", "电影名"], index=None)
        self.fileName = 'C:/Users/Administrator/Desktop/豆瓣电影评分.xlsx'
        # sheetName = '豆瓣电影评分'

        # 获取检索标签名
        self.tag_Movie = {
            0: "热门",
            1: "最新",
            2: "经典",
            3: "豆瓣高分",
            4: "冷门佳片",
            5: "华语",
            6: "欧美",
            7: "韩国",
            8: "日本",
            9: "动作",
            10: "喜剧",
            11: "爱情",
            12: "科技",
            13: "悬疑",
            14: "恐怖",
            15: "治愈"
        }

    def douban_movie(self):

        # 创建空文件
        self.sr1.to_excel(self.fileName, index=False, header=True)
        # 遍历检索标签,获取各标签的数据
        for tagNumber in range(0, 16):
            tag_Num = self.tag_Movie[tagNumber]
            # 将获取的数据暂存在sr3列表内
            sr3 = []
            for page_s in range(0, 20, 20):

                for l in self.movie_request(tag_Num, page_s):
                    movieDetails = self.movie_details(movieID=l['id'])
                    sr3.append({
                        '评分': l['rate'],
                        "电影名": l['title'],
                        "导演": json.dumps(movieDetails["directors"], indent=0, separators=('\n', ': '),
                                         ensure_ascii=False),
                        "主演": json.dumps(movieDetails["actors"], indent=0, separators=('\n', ': '), ensure_ascii=False),
                        "时长": movieDetails["duration"]
                    })

            self.excel_data(sr3, tag_Num)

        self.del_excel()  # 移除多余的sheet1页
        print("====数据获取成功====")

    # 移除创建表格时多余的sheet页
    def del_excel(self):

        book = load_workbook(self.fileName)
        ws = book["Sheet1"]
        book.remove(ws)
        book.save(self.fileName)

    # 在新的sheet页添加相应的数据,并保证数据不被覆盖
    def excel_data(self, sr3, tag_Num):
        """
        :param sr3:
        :type tag_Num: object
        """
        book = load_workbook(self.fileName)
        writer = pd.ExcelWriter(self.fileName, mode="a", engine="openpyxl")
        writer.book = book
        sr1 = pd.DataFrame(sr3, index=None)
        sr1.to_excel(writer, index=False, header=True, sheet_name=f"{tag_Num}")
        writer.save()  # 这步才生成文件
        writer.close()

    # 获取豆瓣电影接口数据
    @staticmethod
    def movie_request(tag_Num, page_s):
        """
        :type page_s: object
        :type tag_Num: object
        """
        url = "https://movie.douban.com/j/search_subjects"

        params = {
            "type": "movie",
            "tag": tag_Num,
            "sort": "recommend",
            "page_limit": 20,
            "page_start": page_s
        }

        headers = {
            "Authorization": "token",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54"
        }

        movies = re.get(url, params=params, headers=headers)
        js = movies.json()
        js = js["subjects"]
        data = json.dumps(js, sort_keys=True, indent=4, separators=(',', ': '), ensure_ascii=False)  # 字符串转化
        print(data)
        return js

    # 获取电影详情
    @staticmethod
    def movie_details(movieID):

        url = "https://movie.douban.com/j/subject_abstract?"

        params = {
            "subject_id": movieID
        }

        headers = {
            "Authorization": "token",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54"
        }

        movies = re.get(url, params=params, headers=headers)
        js = movies.json()
        js = js["subject"]
        # data = json.dumps(js, sort_keys=True, indent=4, separators=(',', ': '), ensure_ascii=False)  # 字符串转化
        # print(data)
        return js


if __name__ == '__main__':
    DouBan = DouBan()
    DouBan.douban_movie()
    

生成的Excel表格如下,如果可以生成更漂亮的表格请指教! 表格数据