多线程爬取豆瓣电影排行榜 Top200

35 阅读1分钟
  1. 查看 requests 库、BeautifulSoup 库(bs4)、lxml 库、xlwt 库是否已安装,如果未安装:

pip install requests bs4 lxml xlwt

  1. 先学习线程的相关知识
  2. 爬取豆瓣电影 Top250 排行榜中的前 200 部电影,包含电影名称、电影评价人数、电影的评分
import threading 
import requests
from bs4 import BeautifulSoup
import time
import xlwt
def movie_info(new_url):
    """Scrape one ranking page and append its movies to the shared worksheet.

    The page at ``new_url`` is downloaded and parsed, and the movie title,
    rating and number of raters are extracted with CSS selectors.  One row
    per movie is written to the module-level ``sheet`` starting at the
    module-level ``row`` counter, after which the workbook ``movie_sheet``
    is saved.

    The function relies on these module-level names being defined before it
    is called: ``lock``, ``row``, ``sheet`` and ``movie_sheet``.

    Args:
        new_url: Full URL of the ranking page to scrape.

    Raises:
        requests.exceptions.RequestException: If the download fails or does
            not complete within the timeout.
    """
    global row
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"}

    # ``with`` guarantees the lock is released even when the request, the
    # parser or the save raises; a bare acquire()/release() pair would leave
    # the lock held and block every other worker thread forever.
    # The download happens while the lock is held, so pages are fetched one
    # at a time, with the one-second pause below between them.
    with lock:
        # Without a timeout a stalled connection would hold the lock forever.
        respon = requests.get(new_url, headers=headers, timeout=10)

        print("当前的url:{},状态码为:{}".format(new_url,respon.status_code))
        print("当前线程:{},正在爬取{},".format(threading.current_thread().name,new_url))

        soup = BeautifulSoup(respon.text, "lxml")

        movie_title = [
            tag.text for tag in
            soup.select("#content > div > div.article > ol > li> div > div.info > div.hd > a > span:nth-child(1)")
        ]
        movie_rank_num = [
            tag.text for tag in
            soup.select("#content > div > div.article > ol >li> div > div.info > div.bd > div > span.rating_num")
        ]
        # Keep only the text in front of the "人评价" suffix.
        movie_people = [
            tag.text.split("人评价")[0] for tag in
            soup.select("#content > div > div.article > ol > li > div > div.info > div.bd > div > span:nth-child(4)")
        ]

        # One worksheet row per movie: title, rating, number of raters.
        # zip() stops at the shortest list, so an incomplete page yields
        # fewer rows instead of raising.
        for info in zip(movie_title, movie_rank_num, movie_people):
            for col, data in enumerate(info):
                sheet.write(row, col, data)
            row += 1

        # NOTE(review): xlwt writes the legacy .xls format, so spreadsheet
        # applications may warn about the .xlsx extension.  The file name is
        # left unchanged to keep the output path stable - consider renaming.
        movie_sheet.save("电影top200.xlsx")

        # Pause before the next thread may start its request.
        time.sleep(1)

    print(movie_title)
    print(movie_rank_num)
    print(movie_people)
 
 
 
if __name__ == "__main__":
    # The workbook, sheet, row counter and lock are module-level globals
    # because movie_info() reads and updates them directly.
    movie_sheet = xlwt.Workbook(encoding="utf-8")
    heading = ["电影名称","评价分数","评价人数"]
    sheet = movie_sheet.add_sheet("电影top200")
    # Row 0 holds the column headings; data rows start at row 1.
    for cols, title in enumerate(heading):
        sheet.write(0, cols, title)

    row = 1
    # A plain mutex is enough: the workers only acquire and release it.
    lock = threading.Lock()
    url = "https://movie.douban.com/top250?start={}&filter="

    # The start offset advances by 25 per page, so 8 pages (start=0..175)
    # cover exactly the first 200 movies.  range(9) would also request
    # start=200 and collect 225 entries.
    threads = []
    for i in range(8):
        thre = threading.Thread(target=movie_info, args=(url.format(i * 25),))
        thre.start()
        threads.append(thre)

    # Join only after every thread has been started; joining inside the
    # loop above would run the workers strictly one after another.
    for thre in threads:
        thre.join()
  1. 本文代码完成,诚请品读,若有错误之处,欢迎评论区留言!