爬取电影天堂2021必看榜的种子

246 阅读1分钟
import csv
import requests
import re

# Scrape the "2021必看热片" list from the dytt89.com home page, open every
# movie's detail page, print its title and download link(s), and store them in
# dytt.csv (one row per movie: title(s) first, then download link(s)).

url = "https://dytt89.com/"
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever

# All patterns are loop-invariant, so compile them once up front.
# Home page: the <ul> that follows the "2021必看热片" heading.
obj = re.compile(r'2021必看热片.*?<ul>(?P<page1>.*?)</ul>',re.S)
# Inside that <ul>: the relative link of each movie's detail page.
page1_ojb = re.compile(r'''<li><a href='(?P<movie1>.*?)' title=".*?">.*?</a><span>.*?</li>''',re.S)
# Detail page: the movie title and the download link (cut off before "&tr=").
details_obj_name = re.compile(r'<div class="title_all"><h1>(?P<movie_name>.*?)</h1></div>',re.S)
details_obj_url = re.compile(r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download_url>.*?)&tr=.*?</a></td>',re.S)

# TLS certificate verification is disabled for this site; the context manager
# releases the connection even if decoding fails.
with requests.get(url, verify=False, timeout=REQUEST_TIMEOUT) as resp:
    resp.encoding = 'gb2312'    # match the encoding used by the page
    page_content = resp.text

# newline='' is what the csv module expects from the file object; an explicit
# encoding keeps the output independent of the platform default.
with open("dytt.csv", mode='w', newline='', encoding='utf-8') as movie_download:
    movie_write = csv.writer(movie_download)
    for it in obj.finditer(page_content):
        page_first = it.group('page1')
        for itt in page1_ojb.finditer(page_first):
            # The href is site-relative; drop only its leading '/' before
            # appending it to the base url (which already ends with '/').
            movie_url = url + itt.group('movie1').lstrip('/')

            # Same host as the home page, so the same verify=False is needed.
            with requests.get(movie_url, verify=False, timeout=REQUEST_TIMEOUT) as details:
                details.encoding = 'gb2312'
                detaile_content = details.text

            movie_names = [
                match.group('movie_name')
                for match in details_obj_name.finditer(detaile_content)
            ]
            download_urls = [
                match.group('download_url')
                for match in details_obj_url.finditer(detaile_content)
            ]

            for movie_name in movie_names:
                print(movie_name)
            for download_url in download_urls:
                print(download_url)

            # writerow() takes a sequence of fields; passing a bare string
            # would write one character per column.
            if movie_names or download_urls:
                movie_write.writerow(movie_names + download_urls)
#执行结果

![image.png](https://p1-juejin.byteimg.com/tos-cn-i-k3u1fbpfcp/a2fc7db3a248492ea892f0be5e7fe5a0~tplv-k3u1fbpfcp-watermark.image)