获取豆瓣评分排行榜电影

204 阅读1分钟
# coding=utf-8
import re
import requests
import csv

# Request headers: send a desktop-browser User-Agent string with the request
# instead of the default one used by the requests library.
agent = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
}
url = "https://movie.douban.com/top250"
# requests applies no timeout by default, so an unresponsive server would
# block the script forever; give up after 10 seconds instead.
res = requests.get(url, headers=agent, timeout=10)
# Stop with an HTTPError on a 4xx/5xx response rather than running the
# pattern over an error page and silently producing an empty CSV.
res.raise_for_status()
# Decoded HTML of the page; this is the text the regex is run against.
page_content = res.text
# Pattern for one movie entry (<li> ... ) in the page source.
#
# Named groups:
#   url_link   - href of the poster link inside <div class="pic">
#   movie_name - alt text of the poster image
#   year       - text between the <br> in the info paragraph and the first
#                "&nbsp;/&nbsp;" (still carries surrounding whitespace)
#   averge     - text of the rating_num span (spelling kept: the loop that
#                consumes the matches reads the group by this name)
#
# Only the markup surrounding the four captured fields is anchored. Optional
# pieces of an entry (extra title spans, a "playable" badge, the quote
# paragraph) are deliberately NOT required: with re.S every lazy ".*?" can
# cross entry boundaries, so demanding an element that an entry lacks would
# make the match run into the following <li> and mix two movies' data.
obj = re.compile(
    r'<li>.*?<div class="item">.*?<div class="pic">.*?<a href="(?P<url_link>.*?)">.*?'
    r'<img width="100" alt="(?P<movie_name>.*?)".*?'
    r'<div class="bd">.*?<p class="">.*?<br>(?P<year>.*?)&nbsp;/&nbsp;.*?'
    r'<span class="rating_num" property="v:average">(?P<averge>.*?)</span>',
    re.S,
)
# Run the compiled pattern over the page source; finditer yields one match
# object per movie entry, lazily.
result = obj.finditer(page_content)
try:
    # newline="" is what the csv module asks for so that no blank rows appear
    # on Windows; the explicit UTF-8 encoding keeps the Chinese labels and
    # titles writable regardless of the platform's locale encoding. The
    # with-block closes the file even if a row fails to convert.
    with open("douban.csv", mode="w", newline="", encoding="utf-8") as movie_csv:
        writer_info = csv.writer(movie_csv)
        for i in result:
            # The captured year carries the whitespace that follows <br>;
            # strip the captured text itself (not the group name) before
            # converting it to an int.
            year = int(i.group('year').strip())
            # Prefix every value with its label. Values are formatted as
            # scalars, so a cell reads "上映年份:1994", not a list repr.
            b = '电影名字:{}'.format(i.group('movie_name'))
            c = '播放链接:{}'.format(i.group('url_link'))
            d = '上映年份:{}'.format(year)
            e = '电影评分:{}'.format(i.group('averge'))
            # writerow takes a single iterable: one element per CSV column.
            writer_info.writerow((b, c, d, e))
            print(b, '\t', c, '\t', d, '\t', e)
finally:
    # Release the HTTP connection whether or not writing succeeded.
    res.close()