- 查看 requests 库、BeautifulSoup 库(bs4)、lxml 库、xlwt 库是否已安装,如果未安装:
pip install requests bs4 lxml xlwt
- 先学习线程的相关知识
- 爬取豆瓣电影排行榜 Top250,包含电影名称、电影的评分、电影评价人数
import threading
import requests
from bs4 import BeautifulSoup
import time
import xlwt
def movie_info(new_url):
    """Scrape one Douban Top 250 list page into the shared spreadsheet.

    Fetches ``new_url``, extracts every movie's title, rating and number of
    ratings, writes one row per movie starting at the module-level ``row``
    counter, and saves the workbook after the page has been written.

    The function relies on module-level globals created by the script entry
    point: ``lock`` (guards the shared state), ``sheet`` and ``movie_sheet``
    (the xlwt worksheet and workbook) and ``row`` (index of the next free
    row, advanced here).

    Args:
        new_url: URL of the list page to scrape.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout). The lock is released before the error propagates.
    """
    global row
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"}
    # ``with`` guarantees the lock is released even when the request, the
    # parsing or the save raises; a bare acquire()/release() pair would leave
    # every other worker thread blocked forever after the first failure.
    # The request and the sleep stay inside the lock, so pages are still
    # fetched one at a time with a one-second pause between them.
    with lock:
        # Without a timeout a stalled connection would hold the lock forever.
        respon = requests.get(new_url, headers=headers, timeout=10)
        print("当前的url:{},状态码为:{}".format(new_url, respon.status_code))
        print("当前线程:{},正在爬取{},".format(threading.current_thread().name, new_url))
        soup = BeautifulSoup(respon.text, "lxml")

        # One entry per <li> of the ranking list: first title span, rating
        # value, and the "<N>人评价" span reduced to the part before the suffix.
        movie_title = [
            tag.text for tag in
            soup.select("#content > div > div.article > ol > li> div > div.info > div.hd > a > span:nth-child(1)")
        ]
        movie_rank_num = [
            tag.text for tag in
            soup.select("#content > div > div.article > ol >li> div > div.info > div.bd > div > span.rating_num")
        ]
        movie_people = [
            tag.text.split("人评价")[0] for tag in
            soup.select("#content > div > div.article > ol > li > div > div.info > div.bd > div > span:nth-child(4)")
        ]

        # zip() stops at the shortest list, so a movie missing any of the
        # three fields truncates the page instead of misaligning columns.
        for record in zip(movie_title, movie_rank_num, movie_people):
            for col, data in enumerate(record):
                sheet.write(row, col, data)
            row += 1

        # NOTE(review): xlwt writes the legacy .xls format, so the ".xlsx"
        # extension is misleading - confirm with consumers before renaming.
        movie_sheet.save("电影top200.xlsx")
        time.sleep(1)

    print(movie_title)
    print(movie_rank_num)
    print(movie_people)
if __name__ == "__main__":
    # Module-level state shared with movie_info(): the workbook, its sheet,
    # the next free row index and the lock that guards all three.
    movie_sheet = xlwt.Workbook(encoding="utf-8")
    heading = ["电影名称", "评价分数", "评价人数"]
    sheet = movie_sheet.add_sheet("电影top200")
    # Row 0 holds the column headers; data rows start at row 1.
    for cols, heading_text in enumerate(heading):
        sheet.write(0, cols, heading_text)
    row = 1
    lock = threading.Condition()
    url = "https://movie.douban.com/top250?start={}&filter="
    # The "start" offset advances by 25 per page, so the full Top 250 needs
    # offsets 0..225, i.e. 10 pages; range(9) stopped at 225 movies.
    threads = []
    for i in range(10):
        thre = threading.Thread(target=movie_info, args=(url.format(i * 25),))
        thre.start()
        threads.append(thre)
    # Join only after every thread has been started; joining inside the loop
    # above would run the workers strictly one after another.
    for thre in threads:
        thre.join()
- 本文代码完成,诚请品读,若有错误之处,欢迎评论区留言!