一、数据分析
1.请求分析
请求方法
GET:查询参数,在链接地址中查看(显性)
POST:载荷中查看数据(隐性)
2.链接分析
图片、视频、音频、文件下载 ---》链接地址
* 图片:开发者工具---》网络---》Img
* 音频、视频:开发者工具---》网络---》媒体
3.获取数据
- response.text:文本数据
- response.json():json类型,字典
- response.content:二进制数据
二、代码实现
1.单一章节内容获取
import requests
import os

# Browser-like User-Agent so the image CDN does not reject the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
# Chapter-content API for one chapter (ids and sign are chapter-specific).
url = ('https://comic.mkzcdn.com/chapter/content/v1/?chapter_id=995007&comic_id=209412&format=1&quality=1&sign=9b945c66c02fa7d5c82d0367b4ba56b0&type=1&uid=75918029')
response = requests.get(url=url, headers=headers)
pages = response.json()['data']['page']

# BUG FIX: the original path literal '漫客栈\灵剑尊\' ends with a backslash,
# which escapes the closing quote and is a SyntaxError. Build the path with
# os.path.join and make sure the directory exists before writing into it.
save_dir = os.path.join('漫客栈', '灵剑尊')
os.makedirs(save_dir, exist_ok=True)

# Download every page image of the chapter, numbered from 1.
for num, page in enumerate(pages, start=1):
    img = page['image']
    img_content = requests.get(url=img, headers=headers).content
    with open(os.path.join(save_dir, f'{num}.jpg'), 'wb') as f:
        f.write(img_content)
    print(img)
2.整本书章节分析
517
https://content.mkzcdn.com/image/20240422/662643cd314b8-800x1169.jpg!page-800-x?auth_key=1727716595-0-0-a778356b788bc17314fe1be0b1c2c848
518
https://content.mkzcdn.com/image/20240422/662643b708319-800x1504.jpg!page-800-x?auth_key=1727715940-0-0-ab2f39c09b944f472860da96e357ad06
519
https://content.mkzcdn.com/image/20240422/662643bc73318-800x1656.jpg!page-800-x?auth_key=1727715828-0-0-ac4b8e397425b8bbe7105580bd3949fc
zip函数是 Python 中的一个内置函数,它用于将多个可迭代对象(如列表、元组等)中对应的元素打包成一个个元组,然后返回这些元组组成的迭代器。如果各个可迭代对象的长度不一致,zip函数会以最短的可迭代对象为准进行打包。
# Demonstrate zip(): walk two parallel lists in lockstep.
names = ['Alice', 'Bob', 'Charlie']
ages = [25, 30, 35]
# zip stops at the shorter iterable; here both have length 3.
for name, age in zip(names, ages):
    print(f'{name} is {age} years old.')
将标题和网络链接对应(并没有实现功能)
import requests
import re
from lxml import etree,html
#url = 'https://www.mkzhan.com/209412/'
# BUG FIX: HTTP header names must not contain spaces — "User - Agent" is not
# a valid field name and the server would never see a User-Agent, risking a
# block. Likewise the quality values must be written "q=0.9", not "q = 0.9".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
}
def get_response(url, headers):
    """Fetch *url* with the given request headers and return the body as text."""
    return requests.get(url, headers=headers).text
def get_chapter(url, header):
    """Scrape a comic's index page and print every chapter URL with its title.

    url    -- the comic's index page, e.g. https://www.mkzhan.com/209412/
    header -- request-header dict forwarded to the HTTP request
    """
    # BUG FIX: the original ignored the `header` parameter and silently used
    # the module-level `headers` global instead.
    response = get_response(url, header)
    # Chapter links are carried in data-hreflink attributes of the page HTML.
    link_list = re.findall(r'data-hreflink="(.*?)"', response)
    # BUG FIX: the comprehension variable used to shadow the `url` parameter.
    url_list = ['https://www.mkzhan.com' + str(link) for link in link_list]
    tree = html.fromstring(response)
    titles = tree.xpath('/html/body/div[3]/div[1]/div[1]/div[2]/ul/li/a/text()')
    title_list = [title.strip() for title in titles if title.strip()]
    # NOTE: zip truncates to the shorter list if link/title counts disagree.
    dict_info = dict(zip(url_list, title_list))
    for chapter_url, title in dict_info.items():
        print(chapter_url, title)
#get_chapter(url,headers)
# Single-chapter probe: pull the lazy-loaded image URLs from one chapter page.
url = 'https://www.mkzhan.com/209412/1004107.html'
response = get_response(url, headers)
# print(response)
tree = html.fromstring(response)
# Images are lazy-loaded, so the real source lives in @data-src, not @src.
img = tree.xpath('/html/body/div[2]/div[2]/div[1]/div/img/@data-src')
print(img)
3.完整项目代码
import requests
import re
import os

# BUG FIX: "User - Agent" (with spaces) is not a valid header name; the
# server would never receive a User-Agent and could block the crawler.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
}

# Characters that are illegal in Windows file names (backslash included —
# the original pattern missed it). Compiled once, reused in the loops.
ILLEGAL_CHARS = re.compile(r'[\\/:*?"<>|]')

for i in range(1, 36):
    # Each listing page exposes one page of book ids and titles.
    list_url = f"https://www.mkzhan.com/category/?is_vip=1&page={i}"
    html = requests.get(list_url, headers=headers).text
    id_list = re.findall('<a class="cover" href="/(.*?)/" target="_blank">', html)
    title_list = re.findall('<p class="comic-feature">(.*?)</p>', html)
    print(id_list, title_list)

    # `book_id` avoids shadowing the builtin `id` the original used.
    for book_id, name in zip(id_list, title_list):
        new_name = ILLEGAL_CHARS.sub('', name)
        # BUG FIX: f'img\{new_name}\' ends with a backslash that escapes the
        # closing quote (SyntaxError), and backslashes are not allowed inside
        # f-strings before Python 3.12. Build the path portably instead, and
        # create the whole tree (os.mkdir fails when 'img' does not exist).
        path = os.path.join('img', new_name)
        print(book_id, new_name)
        os.makedirs(path, exist_ok=True)

        # Fetch every chapter id and title for this book.
        link = f'https://comic.mkzcdn.com/chapter/v1/?comic_id={book_id}'
        content = requests.get(url=link, headers=headers).json()['data']
        for index in content:
            new_title = ILLEGAL_CHARS.sub('', index['title'])
            chapter_id = index['chapter_id']
            url = (
                f'https://comic.mkzcdn.com/chapter/content/v1/?chapter_id={chapter_id}'
                f'&comic_id={book_id}&format=1&quality=1'
                f'&sign=9b945c66c02fa7d5c82d0367b4ba56b0&type=1&uid=75918029'
            )
            pages = requests.get(url=url, headers=headers).json()['data']['page']
            # Save each page image as "<title>--<page-number>.jpg".
            for num, page in enumerate(pages, start=1):
                img = page['image']
                img_content = requests.get(url=img, headers=headers).content
                with open(os.path.join(path, f'{new_title}--{num}.jpg'), 'wb') as f:
                    f.write(img_content)
                # print(img)
            print(f'{new_title}下载完成')