import requests
from bs4 import BeautifulSoup
# Function 1: fetch a web page.
def page_request(url, ua):
    """Fetch *url* and return the response body as text.

    Args:
        url: Full URL to request (must include the scheme, e.g. ``https://``).
        ua: Headers dict, typically carrying a ``User-Agent`` entry.

    Returns:
        The decoded HTML of the response as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, headers=ua)
    response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
    return response.text
# Function 2: parse the page and extract detailed info (basic + extra) per movie.
def parse_movie_info(soup):
    """Extract per-movie details from one parsed Douban Top250 list page.

    Args:
        soup: BeautifulSoup object for one list page.

    Returns:
        A list of dicts (ranking, title, rating, director, writer, stars,
        genre, release_date, runtime, rating_count, summary, link); any
        field missing from the HTML is set to 'N/A'.
    """
    detailed_list = []
    for movie in soup.select('div.item'):  # one div.item per movie entry; adjust if the HTML changes
        em_tag = movie.find('em')
        ranking = em_tag.text if em_tag else 'N/A'
        title_tag = movie.find('span', class_='title')
        title = title_tag.text if title_tag else 'N/A'
        rating_tag = movie.find('span', class_='rating_num')
        rating_num = rating_tag.text if rating_tag else 'N/A'
        # Bug fix: the original prefix lacked a scheme, producing an unusable
        # URL. NOTE(review): Douban hrefs may already be absolute — confirm
        # against the live HTML before relying on this concatenation.
        a_tag = movie.find('a')
        link = 'https://movie.douban.com' + a_tag['href'] if a_tag else 'N/A'

        # Bug fix: default every optional field up front so that a missing
        # div.bd can no longer leave these names unbound (the original would
        # raise NameError when building the dict below).
        director_info = writer_info = stars_info = 'N/A'
        genre_info = release_date = runtime = summary = 'N/A'
        bd = movie.find('div', class_='bd')
        if bd:
            paragraphs = bd.find_all('p')  # hoisted: queried once instead of three times
            if len(paragraphs) > 0:
                director_info = paragraphs[0].text.strip().split('\n')[0].replace('导演:', '').strip()
            if len(paragraphs) > 1:
                writer_info = paragraphs[1].text.strip().split('\n')[0].replace('编剧:', '').strip()
            if len(paragraphs) > 2:
                stars_info = paragraphs[2].text.strip().split('\n')[0].replace('主演:', '').strip()
            # Genre/date/runtime may live elsewhere in the real HTML; these
            # selectors mirror the original code and may need adjusting.
            inq_tag = bd.find('span', class_='inq')
            if inq_tag:
                genre_info = inq_tag.text.strip()
            pl_tags = bd.find_all('span', class_='pl')  # hoisted; [0]=release date, [1]=runtime (assumed)
            if len(pl_tags) > 0:
                release_date = pl_tags[0].text.strip().replace('上映时间:', '').strip()
            if len(pl_tags) > 1:
                runtime = pl_tags[1].text.strip().replace('片长:', '').strip()
            summary_tag = bd.find('span', property='v:summary')
            if summary_tag:
                summary = summary_tag.text.strip()

        # Bug fix: guard against a missing div.star — the original chained
        # .find(...).find_all(...) and crashed with AttributeError on None.
        star_div = movie.find('div', class_='star')
        star_spans = star_div.find_all('span') if star_div else []
        rating_count = star_spans[-1].text.strip().replace('人评价', '').strip() if star_spans else 'N/A'

        detailed_list.append({
            'ranking': ranking,
            'title': title,
            'rating': rating_num,
            'director': director_info,
            'writer': writer_info,
            'stars': stars_info,
            'genre': genre_info,
            'release_date': release_date,
            'runtime': runtime,
            'rating_count': rating_count,
            'summary': summary,
            'link': link,
        })
    return detailed_list
# Function 3: save the detailed info to a local file.
def save_to_file(movie_list, filename):
    """Write one human-readable, multi-line record per movie to *filename*.

    Args:
        movie_list: List of movie dicts as produced by ``parse_movie_info``.
        filename: Path of the output file; overwritten, encoded as UTF-8.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        for movie in movie_list:
            # First line: compact summary; following lines: one field each,
            # terminated by a blank line separating records.
            line = f"{movie['ranking']}-{movie['title']}-{movie['rating']}-{movie['link']}\n"
            line += (
                f"导演: {movie['director']}\n"
                f"编剧: {movie['writer']}\n"
                f"主演: {movie['stars']}\n"
                f"类型: {movie['genre']}\n"
                f"上映时间: {movie['release_date']}\n"
                f"片长: {movie['runtime']}\n"
                f"评分人数: {movie['rating_count']}\n"
                f"剧情简介: {movie['summary']}\n\n"
            )
            file.write(line)
# Main entry point.
def main():
    """Crawl all 10 pages of the Douban Top250 list and save the details.

    Fetches each page, parses every movie entry, and writes the combined
    results to ``douban_movies.txt`` in the current directory.
    """
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Bug fix: the original URL had no scheme, so requests.get would raise
    # MissingSchema before any network I/O happened.
    base_url = 'https://movie.douban.com/top250'
    all_movies = []
    for start in range(0, 250, 25):  # Top250: 25 movies per page, 10 pages
        url = f"{base_url}?start={start}"
        html_content = page_request(url, ua)
        soup = BeautifulSoup(html_content, 'html.parser')  # parse the HTML string
        all_movies.extend(parse_movie_info(soup))
    # Persist everything in one pass once all pages are collected.
    save_to_file(all_movies, 'douban_movies.txt')


# Bug fix: the original guard compared the undefined names `name`/`main`
# (NameError at import time); the canonical dunder guard is required.
if __name__ == '__main__':
    main()