关于豆瓣top排行榜的爬虫

89 阅读2分钟

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

 

# 函数1:请求网页

def page_request(url, ua, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address to request.
        ua: Mapping of HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection errors and timeouts.
    """
    response = requests.get(url, headers=ua, timeout=timeout)
    # Fail loudly on HTTP error statuses instead of parsing an error page.
    response.raise_for_status()
    return response.text

 

# 函数2:解析网页并获取详细信息(包含基本信息和额外信息)

_NOT_AVAILABLE = 'N/A'
_DOUBAN_BASE_URL = 'https://movie.douban.com/'


def _item_or_none(items, index):
    """Return ``items[index]``, or None when the list is too short."""
    return items[index] if len(items) > index else None


def _clean_text(tag, label='', first_line_only=False):
    """Return the cleaned text of ``tag``, or 'N/A' when ``tag`` is None.

    The text is stripped, optionally cut down to its first line, and has
    ``label`` (a field prefix or suffix) removed.
    """
    if tag is None:
        return _NOT_AVAILABLE
    text = tag.text.strip()
    if first_line_only:
        text = text.split('\n')[0]
    if label:
        text = text.replace(label, '')
    return text.strip()


def parse_movie_info(soup):
    """Extract one record per ``div.item`` element found in ``soup``.

    Every record always carries the same twelve keys, so consumers can
    index them without checking; a value that cannot be located in the
    markup is the string ``'N/A'``.

    NOTE(review): the selectors used for writer, stars, genre,
    release date, runtime and summary are carried over unchanged from
    the original code, which itself flagged them as guesses about the
    page structure -- verify them against the live HTML.

    Args:
        soup: Parsed document exposing ``select``; each selected item
            must expose ``find`` and ``find_all``.

    Returns:
        A list of dicts with the keys ``ranking``, ``title``, ``rating``,
        ``director``, ``writer``, ``stars``, ``genre``, ``release_date``,
        ``runtime``, ``rating_count``, ``summary`` and ``link``.
    """
    detailed_list = []

    for movie in soup.select('div.item'):
        # Basic information.
        rank_tag = movie.find('em')
        title_tag = movie.find('span', class_='title')
        rating_tag = movie.find('span', class_='rating_num')

        # urljoin leaves an absolute href untouched and resolves a
        # relative one against the site root, so both forms yield a
        # usable URL (plain string concatenation did for neither).
        anchor = movie.find('a')
        href = anchor.get('href') if anchor is not None else None
        link = urljoin(_DOUBAN_BASE_URL, href) if href else _NOT_AVAILABLE

        # The vote count is read from the last <span> of the star block;
        # a missing block yields 'N/A' instead of an AttributeError.
        star = movie.find('div', class_='star')
        star_spans = star.find_all('span') if star is not None else []
        count_tag = star_spans[-1] if star_spans else None

        # Extra information lives under div.bd; look everything up once.
        bd = movie.find('div', class_='bd')
        if bd is not None:
            paragraphs = bd.find_all('p')
            pl_spans = bd.find_all('span', class_='pl')
            inq_tag = bd.find('span', class_='inq')
            summary_tag = bd.find('span', property='v:summary')
        else:
            paragraphs = []
            pl_spans = []
            inq_tag = None
            summary_tag = None

        detailed_list.append({
            'ranking': rank_tag.text if rank_tag is not None else _NOT_AVAILABLE,
            'title': title_tag.text if title_tag is not None else _NOT_AVAILABLE,
            'rating': rating_tag.text if rating_tag is not None else _NOT_AVAILABLE,
            'director': _clean_text(
                _item_or_none(paragraphs, 0), '导演:', first_line_only=True),
            'writer': _clean_text(
                _item_or_none(paragraphs, 1), '编剧:', first_line_only=True),
            'stars': _clean_text(
                _item_or_none(paragraphs, 2), '主演:', first_line_only=True),
            'genre': _clean_text(inq_tag),
            'release_date': _clean_text(_item_or_none(pl_spans, 0), '上映时间:'),
            # Runtime is assumed to sit in the second span.pl.
            'runtime': _clean_text(_item_or_none(pl_spans, 1), '片长:'),
            'rating_count': _clean_text(count_tag, '人评价'),
            'summary': _clean_text(summary_tag),
            'link': link,
        })

    return detailed_list

 

# 函数3:保存详细信息到本地文件

def save_to_file(movie_list, filename):
    """Write every movie record to ``filename`` as a UTF-8 text report.

    Each record produces a header line ``ranking-title-rating-link``
    followed by one labelled line per remaining field and a blank
    separator line. The file is overwritten if it already exists.

    Args:
        movie_list: Iterable of dicts keyed by ``ranking``, ``title``,
            ``rating``, ``link``, ``director``, ``writer``, ``stars``,
            ``genre``, ``release_date``, ``runtime``, ``rating_count``
            and ``summary``. A missing key is written as ``'N/A'`` so one
            incomplete record does not abort the whole export.
        filename: Path of the output file.
    """
    field_names = (
        'ranking', 'title', 'rating', 'link', 'director', 'writer',
        'stars', 'genre', 'release_date', 'runtime', 'rating_count',
        'summary',
    )
    with open(filename, 'w', encoding='utf-8') as file:
        for movie in movie_list:
            # Normalise the record first so the template can index freely.
            record = {key: movie.get(key, 'N/A') for key in field_names}
            file.write(
                f"{record['ranking']}-{record['title']}-"
                f"{record['rating']}-{record['link']}\n"
                f"导演: {record['director']}\n"
                f"编剧: {record['writer']}\n"
                f"主演: {record['stars']}\n"
                f"类型: {record['genre']}\n"
                f"上映时间: {record['release_date']}\n"
                f"片长: {record['runtime']}\n"
                f"评分人数: {record['rating_count']}\n"
                f"剧情简介: {record['summary']}\n\n"
            )

 

# 主函数

def main():
    """Scrape the Douban Top 250 list and save it to ``douban_movies.txt``.

    Requests the ten result pages (25 movies each) in order, parses each
    page with ``parse_movie_info`` and writes the combined records with
    ``save_to_file``. Any request failure propagates from
    ``page_request`` and stops the run.
    """
    # A browser-like User-Agent header is sent with every request.
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # The scheme is required: requests rejects a URL without one.
    base_url = 'https://movie.douban.com/top250'
    all_movies = []

    # Top 250 is paginated 25 per page: start = 0, 25, ..., 225.
    for start in range(0, 250, 25):
        url = f"{base_url}?start={start}"
        html_content = page_request(url, ua)
        soup = BeautifulSoup(html_content, 'html.parser')
        all_movies.extend(parse_movie_info(soup))

    # Persist everything collected across all pages.
    save_to_file(all_movies, 'douban_movies.txt')

 

# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()