使用Python获取“豆瓣电影”相关数据,并生成Excel表格
本文使用了requests和pandas第三方库,对豆瓣电影各相关数据进行爬取,并记录至Excel表格内。但发现存在以下三个问题:
- 生成的Excel表格,第一个sheet页为空
- 数据爬取数据过长
- 无法获取每个标签的具体电影数量,所以我取了一个吉祥的数字
520,但请各位大佬指教,若为空,则直接停止,进入下一个标签的爬取
请各位大佬进行指教,并进行优化,小弟在此谢谢!
注:要是觉得文章写得不错,记得留个赞哦!
因为如果一开始不生成一个sheet页,则无法使用
ExcelWriter的方法,小弟的解决方法是使用remove方法进行移除多余的sheet页,代码如下,请各位大佬指教!
# 移除创建表格时多余的sheet页
def del_excel(self):
book = load_workbook(self.fileName)
ws = book["Sheet1"]
book.remove(ws)
book.save(self.fileName)
爬取豆瓣电影相关数据的具体代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests as re
from openpyxl import load_workbook
import pandas as pd
import json
class DouBan:
def __init__(self):
self.sr1 = pd.DataFrame(columns=["评分", "电影名"], index=None)
self.fileName = 'C:/Users/Administrator/Desktop/豆瓣电影评分.xlsx'
# sheetName = '豆瓣电影评分'
# 获取检索标签名
self.tag_Movie = {
0: "热门",
1: "最新",
2: "经典",
3: "豆瓣高分",
4: "冷门佳片",
5: "华语",
6: "欧美",
7: "韩国",
8: "日本",
9: "动作",
10: "喜剧",
11: "爱情",
12: "科技",
13: "悬疑",
14: "恐怖",
15: "治愈"
}
def douban_movie(self):
# 创建空文件
self.sr1.to_excel(self.fileName, index=False, header=True)
# 遍历检索标签,获取各标签的数据
for tagNumber in range(0, 16):
tag_Num = self.tag_Movie[tagNumber]
# 将获取的数据暂存在sr3列表内
sr3 = []
for page_s in range(0, 20, 20):
for l in self.movie_request(tag_Num, page_s):
movieDetails = self.movie_details(movieID=l['id'])
sr3.append({
'评分': l['rate'],
"电影名": l['title'],
"导演": json.dumps(movieDetails["directors"], indent=0, separators=('\n', ': '),
ensure_ascii=False),
"主演": json.dumps(movieDetails["actors"], indent=0, separators=('\n', ': '), ensure_ascii=False),
"时长": movieDetails["duration"]
})
self.excel_data(sr3, tag_Num)
self.del_excel() # 移除多余的sheet1页
print("====数据获取成功====")
# 移除创建表格时多余的sheet页
def del_excel(self):
book = load_workbook(self.fileName)
ws = book["Sheet1"]
book.remove(ws)
book.save(self.fileName)
# 在新的sheet页添加相应的数据,并保证数据不被覆盖
def excel_data(self, sr3, tag_Num):
"""
:param sr3:
:type tag_Num: object
"""
book = load_workbook(self.fileName)
writer = pd.ExcelWriter(self.fileName, mode="a", engine="openpyxl")
writer.book = book
sr1 = pd.DataFrame(sr3, index=None)
sr1.to_excel(writer, index=False, header=True, sheet_name=f"{tag_Num}")
writer.save() # 这步才生成文件
writer.close()
# 获取豆瓣电影接口数据
@staticmethod
def movie_request(tag_Num, page_s):
"""
:type page_s: object
:type tag_Num: object
"""
url = "https://movie.douban.com/j/search_subjects"
params = {
"type": "movie",
"tag": tag_Num,
"sort": "recommend",
"page_limit": 20,
"page_start": page_s
}
headers = {
"Authorization": "token",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54"
}
movies = re.get(url, params=params, headers=headers)
js = movies.json()
js = js["subjects"]
data = json.dumps(js, sort_keys=True, indent=4, separators=(',', ': '), ensure_ascii=False) # 字符串转化
print(data)
return js
# 获取电影详情
@staticmethod
def movie_details(movieID):
url = "https://movie.douban.com/j/subject_abstract?"
params = {
"subject_id": movieID
}
headers = {
"Authorization": "token",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54"
}
movies = re.get(url, params=params, headers=headers)
js = movies.json()
js = js["subject"]
# data = json.dumps(js, sort_keys=True, indent=4, separators=(',', ': '), ensure_ascii=False) # 字符串转化
# print(data)
return js
if __name__ == '__main__':
DouBan = DouBan()
DouBan.douban_movie()
生成的Excel表格如下,如果可以生成更漂亮的表格请指教!