Attempt to crawl the "2021必看热片" (2021 must-watch hits) list on https://dytt89.com/, extract each movie's sub-page link, and then the movie's download link.
import requests
movie = "https://dytt89.com/"
result = requests.get(movie)
print(result.text)
The encoding is wrong; set the character encoding explicitly
import requests
movie = "https://dytt89.com/"
result = requests.get(movie)
# specify the character set
result.encoding = 'gb2312'
print(result.text)
It runs correctly now.
gb2312 can also be replaced with gbk here, because gbk is a superset of gb2312.
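Instead of hard-coding the charset, requests can also guess it from the response body via apparent_encoding; a minimal sketch (detection can be slower and occasionally wrong, so an explicit gbk remains the safer choice):
import requests
movie = "https://dytt89.com/"
result = requests.get(movie)
# let requests detect the character set from the body instead of hard-coding it
result.encoding = result.apparent_encoding
print(result.encoding)   # expected to be a GB-family encoding for this site
print(result.text[:300])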
Start locating the data we need.
import requests
import re
movie = "https://dytt89.com/"
result = requests.get(movie)
# specify the character set (gbk is a superset of gb2312)
result.encoding = 'gbk'
# print(result.text)
# pull out the <ul> content under the "2021必看热片" heading
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
result1 = obj1.finditer(result.text)
# a for loop is not strictly necessary here, because only one match is returned
for a in result1:
    print(a.group("ul"))
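As the comment notes, the loop can be skipped; a minimal sketch using search() on the same obj1, which returns the first (and here only) match:
m = obj1.search(result.text)
if m:
    # search() returns a single match object (or None), so no loop is needed
    print(m.group("ul"))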
Extract each movie's link, title, and date.
import requests
import re
movie = "https://dytt89.com/"
result = requests.get(movie)
# specify the character set (gbk is a superset of gb2312)
result.encoding = 'gbk'
# print(result.text)
# pull out the <ul> content
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"href='(?P<href>.*?)'.*?>(?P<title>.*?)</a><span><font color=#FF0000>(?P<date>.*?)</font>", re.S)
result1 = obj1.finditer(result.text)
for a in result1:
    u = a.group("ul")
    # get each sub-page link, title and date from the <ul> block
    result2 = obj2.finditer(u)
    for b in result2:
        print(b.group("href"))
        print(b.group("title"))
        print(b.group("date"))
The sub-page links are relative and are reached from the main URL, so they have to be joined onto it.
import requests
import re
movie = "https://dytt89.com/"
result = requests.get(movie)
# specify the character set (gbk is a superset of gb2312)
result.encoding = 'gbk'
# print(result.text)
# pull out the <ul> content
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"href='(?P<href>.*?)'", re.S)
result1 = obj1.finditer(result.text)
for a in result1:
    u = a.group("ul")
    # join to build the sub-page link:
    result2 = obj2.finditer(u)
    for b in result2:
        a_href = movie + b.group("href")             # naive concatenation keeps a double slash
        b_href = movie + b.group("href").strip("/")  # strip the leading slash first
        print(a_href)
        print(b_href)
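String concatenation works here, but urllib.parse.urljoin handles leading slashes and absolute URLs more robustly; a minimal sketch (the relative path below is just a made-up example):
from urllib.parse import urljoin
movie = "https://dytt89.com/"
# urljoin resolves a relative href against the base URL without producing a double slash
print(urljoin(movie, "/i/100000.html"))   # hypothetical href, prints https://dytt89.com/i/100000.html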
Next, save the sub-page links into a list instead of printing them:
import requests
import re
movie = "https://dytt89.com/"
result = requests.get(movie)
# specify the character set (gbk is a superset of gb2312)
result.encoding = 'gbk'
# print(result.text)
# pull out the <ul> content
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"href='(?P<href>.*?)'", re.S)
result1 = obj1.finditer(result.text)
b_href_list = []
for a in result1:
    u = a.group("ul")
    # join to build the sub-page link:
    result2 = obj2.finditer(u)
    for b in result2:
        b_href = movie + b.group("href").strip("/")
        # save the sub-page link
        b_href_list.append(b_href)
print(b_href_list)
Jump to a sub-page and inspect its content.
import requests
import re
movie = "https://dytt89.com/"
result = requests.get(movie)
# specify the character set (gbk is a superset of gb2312)
result.encoding = 'gbk'
# print(result.text)
# pull out the <ul> content
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"href='(?P<href>.*?)'", re.S)
result1 = obj1.finditer(result.text)
b_href_list = []
for a in result1:
    u = a.group("ul")
    # join to build the sub-page link:
    result2 = obj2.finditer(u)
    for b in result2:
        b_href = movie + b.group("href").strip("/")
        # save the sub-page link
        b_href_list.append(b_href)
# fetch the content of the sub-pages
for url in b_href_list:
    result3 = requests.get(url)
    result3.encoding = 'gbk'
    print(result3.text)
    break  # only inspect the first sub-page for now
The sub-page is fetched correctly.
On to the next step.
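Since the next step requests every sub-page, reusing a single requests.Session (plus a timeout) avoids opening a new connection per URL; a minimal sketch, not part of the original walkthrough, assuming b_href_list from the block above:
import requests
session = requests.Session()
for url in b_href_list:
    # one persistent connection for all sub-page requests; fail fast if the site hangs
    resp = session.get(url, timeout=10)
    resp.encoding = 'gbk'
    # ... parse resp.text as in the next block ...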
import requests
import re
movie = "https://dytt89.com/"
result = requests.get(movie)
# specify the character set (gbk is a superset of gb2312)
result.encoding = 'gbk'
# print(result.text)
# pull out the <ul> content
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"href='(?P<href>.*?)'", re.S)
obj3 = re.compile(r'◎片 名 (?P<title>.*?)'
                  r'<br />.*?style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<down_url>.*?)">', re.S)
result1 = obj1.finditer(result.text)
b_href_list = []
for a in result1:
    u = a.group("ul")
    # join to build the sub-page link:
    result2 = obj2.finditer(u)
    for b in result2:
        b_href = movie + b.group("href").strip("/")
        # save the sub-page link
        b_href_list.append(b_href)
# extract the content of each sub-page
for url in b_href_list:
    result3 = requests.get(url)
    result3.encoding = 'gbk'
    result4 = obj3.search(result3.text)
    print(result4.group("title"))
    print(result4.group("down_url"))
Because the title extracted from the sub-page is slightly malformed, use the title from the main page instead.
import requests
import re
movie = "https://dytt89.com/"
result = requests.get(movie)
# specify the character set (gbk is a superset of gb2312)
result.encoding = 'gbk'
# print(result.text)
# pull out the <ul> content
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"href='(?P<href>.*?)'.*?>(?P<title>.*?)</a>", re.S)
obj3 = re.compile(r'◎片 名 (?P<title>.*?)'
                  r'<br />.*?style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<down_url>.*?)">', re.S)
result1 = obj1.finditer(result.text)
b_href_list = []
b_title_list = []
for a in result1:
    u = a.group("ul")
    # join to build the sub-page link:
    result2 = obj2.finditer(u)
    for b in result2:
        b_title = b.group("title")
        b_title_list.append(b_title)
        b_href = movie + b.group("href").strip("/")
        # save the sub-page link
        b_href_list.append(b_href)
# extract the content of each sub-page
i = 0
for url in b_href_list:
    result3 = requests.get(url)
    result3.encoding = 'gbk'
    result4 = obj3.search(result3.text)
    print('Title:')
    print(b_title_list[i])            # title taken from the main page
    i += 1
    print(result4.group("title"))     # title as it appears on the sub-page
    print('Download link:')
    print(result4.group("down_url"))
    print("_ " * 100)  # separator