爬虫学习-实际案例操作

176 阅读3分钟

尝试爬取“dytt89.com/”网站中“2021必看热片”栏目,并提取其中每部电影的子页面链接,以及该电影的下载链接

# Step 1: fetch the movie site's front page and dump the raw HTML.
import requests

home_url = "https://dytt89.com/"
response = requests.get(home_url)

print(response.text)

image.png

编码不对,设置编码格式

# Step 2: fetch the front page again, this time decoding it correctly.
import requests

home_url = "https://dytt89.com/"
response = requests.get(home_url)

# The site serves GB2312-encoded pages; tell requests how to decode them.
response.encoding = 'gb2312'

print(response.text)

image.png

运行正常

这里的gb2312换成gbk也是可以的,因为gbk包括gb2312

开始寻找我们需要的数据

# Step 3: locate the <ul> that holds the "2021 must-see" movie list.
import requests
import re

home_url = "https://dytt89.com/"
response = requests.get(home_url)

# Site pages are GBK-encoded (GBK is a superset of GB2312).
response.encoding = 'gbk'

# Non-greedy match from the section heading up to the closing </ul>.
ul_pattern = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)

# finditer yields at most one match for this page, so the loop runs once;
# a plain loop keeps the code identical in shape to the later steps.
for ul_match in ul_pattern.finditer(response.text):
    print(ul_match.group("ul"))

image.png

获取链接,标题,时间

# Step 4: pull the link, title and date out of every entry in the list.
import requests
import re

home_url = "https://dytt89.com/"
response = requests.get(home_url)

# Site pages are GBK-encoded (GBK is a superset of GB2312).
response.encoding = 'gbk'

# First narrow down to the <ul> of the "2021 must-see" section, then
# match each entry's href, title and red-colored date inside it.
ul_pattern = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
item_pattern = re.compile(r"href='(?P<href>.*?)'.*?>(?P<title>.*?)</a><span><font color=#FF0000>(?P<data>.*?)</font>", re.S)

for ul_match in ul_pattern.finditer(response.text):
    ul_html = ul_match.group("ul")

    # Print sub-page link, movie title and release date for each item.
    for item in item_pattern.finditer(ul_html):
        print(item.group("href"))
        print(item.group("title"))
        print(item.group("data"))

image.png

子页面的链接是相对路径,需要与主页的根链接拼接后才能访问

# Step 5: turn the relative hrefs into absolute sub-page URLs.
import requests
import re

home_url = "https://dytt89.com/"
response = requests.get(home_url)

# Site pages are GBK-encoded (GBK is a superset of GB2312).
response.encoding = 'gbk'

ul_pattern = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
href_pattern = re.compile(r"href='(?P<href>.*?)'", re.S)

for ul_match in ul_pattern.finditer(response.text):
    ul_html = ul_match.group("ul")

    # Show both joins side by side: the naive concatenation produces a
    # double slash, while stripping the leading "/" yields a clean URL.
    for item in href_pattern.finditer(ul_html):
        naive_url = home_url + item.group("href")
        clean_url = home_url + item.group("href").strip("/")
        print(naive_url)
        print(clean_url)


image.png

# Step 6: collect the cleaned sub-page URLs into a list for later use.
import requests
import re

home_url = "https://dytt89.com/"
response = requests.get(home_url)

# Site pages are GBK-encoded (GBK is a superset of GB2312).
response.encoding = 'gbk'

ul_pattern = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
href_pattern = re.compile(r"href='(?P<href>.*?)'", re.S)

sub_page_urls = []
for ul_match in ul_pattern.finditer(response.text):
    ul_html = ul_match.group("ul")

    # strip("/") avoids a double slash when joining onto the site root.
    for item in href_pattern.finditer(ul_html):
        sub_page_urls.append(home_url + item.group("href").strip("/"))
print(sub_page_urls)

image.png

跳转子页面查看

"""Step 7: collect the "2021 must-see" sub-page URLs, then fetch the
first one and dump its HTML to verify the joined URLs are reachable."""
import requests
import re

movie = "https://dytt89.com/"

# Use the response as a context manager so the underlying HTTP
# connection is released (the original leaked every response).
with requests.get(movie) as result:
    # Site pages are GBK-encoded (GBK is a superset of GB2312).
    result.encoding = 'gbk'
    home_html = result.text

# <ul> under the "2021 must-see" heading, then each entry's href.
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"href='(?P<href>.*?)'", re.S)

b_herf_list = []
for a in obj1.finditer(home_html):
    u = a.group("ul")

    # strip("/") avoids a double slash when joining onto the site root.
    for b in obj2.finditer(u):
        b_herf_list.append(movie + b.group("href").strip("/"))

# Smoke test: fetch only the first sub-page and print its HTML.
for url in b_herf_list:
    with requests.get(url) as result3:
        result3.encoding = 'gbk'
        print(result3.text)
    break

image.png

能够正常跳转子页面

进行下一步

"""Step 8: crawl every "2021 must-see" sub-page and print each movie's
on-page title and its download link."""
import requests
import re

movie = "https://dytt89.com/"

# Context manager releases the HTTP connection (the original leaked
# every response it fetched).
with requests.get(movie) as result:
    # Site pages are GBK-encoded (GBK is a superset of GB2312).
    result.encoding = 'gbk'
    home_html = result.text

# obj1/obj2 parse the home page; obj3 parses a sub-page: the title after
# the "◎片  名" label and the download URL in the styled table cell.
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"href='(?P<href>.*?)'", re.S)
obj3 = re.compile(r'◎片  名 (?P<title>.*?)'
                  r'<br />.*?style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<down_url>.*?)">', re.S)

b_herf_list = []
for a in obj1.finditer(home_html):
    u = a.group("ul")

    # strip("/") avoids a double slash when joining onto the site root.
    for b in obj2.finditer(u):
        b_herf_list.append(movie + b.group("href").strip("/"))

# Visit each sub-page and extract title + download link.
for url in b_herf_list:
    with requests.get(url) as result3:
        result3.encoding = 'gbk'
        sub_html = result3.text
    result4 = obj3.search(sub_html)
    if result4 is None:
        # Page layout does not match the expected pattern; skip it
        # instead of crashing on .group() of None.
        continue
    print(result4.group("title"))
    print(result4.group("down_url"))
image.png

因为进入子链接后的标题有点问题,所以采用主页面的标题

"""Step 9 (final): crawl each "2021 must-see" sub-page; print the title
taken from the home-page list (the sub-page's own title is unreliable),
the sub-page title, and the download link."""
import requests
import re

movie = "https://dytt89.com/"

# Context manager releases the HTTP connection (the original leaked
# every response it fetched).
with requests.get(movie) as result:
    # Site pages are GBK-encoded (GBK is a superset of GB2312).
    result.encoding = 'gbk'
    home_html = result.text

# obj1/obj2 parse the home page (link + display title per entry);
# obj3 parses a sub-page: the title after the "◎片  名" label and the
# download URL in the styled table cell.
obj1 = re.compile(r'2021必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"href='(?P<href>.*?)'.*?>(?P<title>.*?)</a>", re.S)
obj3 = re.compile(r'◎片  名 (?P<title>.*?)'
                  r'<br />.*?style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<down_url>.*?)">', re.S)

b_herf_list = []
b_title_list = []
for a in obj1.finditer(home_html):
    u = a.group("ul")

    # Save the display title and the absolute sub-page URL in lockstep
    # (strip("/") avoids a double slash when joining onto the site root).
    for b in obj2.finditer(u):
        b_title_list.append(b.group("title"))
        b_herf_list.append(movie + b.group("href").strip("/"))

# zip the parallel lists instead of the original manual index counter.
for url, home_title in zip(b_herf_list, b_title_list):
    with requests.get(url) as result3:
        result3.encoding = 'gbk'
        sub_html = result3.text
    result4 = obj3.search(sub_html)
    if result4 is None:
        # Page layout does not match the expected pattern; skip it
        # instead of crashing on .group() of None.
        continue
    print('片名:')
    print(home_title)
    print(result4.group("title"))
    print('迅雷下载链接:')
    print(result4.group("down_url"))
    print("_ "*100)  # separator between movies


image.png

大功告成