Web Scraping Data: JSON


Data Extraction

What is data extraction?

Simply put, data extraction is the process of pulling the data we want out of a response.

Data classification

  • Unstructured data: HTML
    • Processing methods: regular expressions, XPath
  • Structured data: JSON, XML
    • Processing method: convert it into Python data types (see the sketch below)
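
A minimal sketch of the difference between the two (the HTML snippet and field names are made up, and a simple regular expression stands in for XPath):

import json
import re

# unstructured: the value has to be located with a pattern (or an XPath query)
html = '<p class="price">12.5</p>'
price = re.search(r'class="price">([\d.]+)<', html).group(1)

# structured: the whole body converts directly into Python types
body = '{"price": 12.5}'
price = json.loads(body)['price']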

Extracting data from JSON

Because converting JSON into Python's built-in data types is so easy, when writing a crawler we prefer URLs that return JSON data whenever we can find them.

JSON is a lightweight data-interchange format. It is easy for humans to read and write, and easy for machines to parse and generate, which makes it well suited to data exchange, for example between a website's frontend and backend.
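
A minimal sketch of this conversion with the standard json module (the sample data is made up):

import json

raw = '{"name": "juejin", "count": 108, "tags": ["python", "spider"]}'

data = json.loads(raw)                                  # JSON string -> dict / list / int / str
print(type(data), data['tags'][0])                      # <class 'dict'> python

text = json.dumps(data, ensure_ascii=False, indent=2)   # Python object -> JSON string
print(text)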

Notes on using JSON

  • Strings in JSON are always double-quoted (see the example below)
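
A quick check of this rule with the json module:

import json

print(json.loads('{"ok": true}'))   # valid JSON: double quotes, lowercase true
# json.loads("{'ok': True}")        # raises json.JSONDecodeError: single quotes are not valid JSON
print(json.dumps({'ok': True}))     # dumps always emits double quotes: {"ok": true}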


Scraping a Juejin user's activity feed


# -*- coding: UTF-8 -*-
# @Project :网络爬虫
# @File :抓取 艳烔 掘金动态.py
# @IDE :PyCharm
# @Author :艳烔
# @Date :2024/10/25 11:51

# curl-to-code converter: https://www.lddgo.net/convert/curl-to-code

import requests
import json

# base endpoint; the query string is passed separately via params below
url = 'https://api.juejin.cn/user_api/v1/user/dynamic'

cookies = {
    '_tea_utm_cache_2608': 'undefined',
    '__tea_cookie_tokens_2608': '%257B%2522web_id%2522%253A%25227417859385638143499%2522%252C%2522user_unique_id%2522%253A%25227417859385638143499%2522%252C%2522timestamp%2522%253A1727105001245%257D',
    'csrf_session_id': '5eb2f196777d9a821701fab130cc2433',
}

headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    # Requests sorts cookies= alphabetically
    # 'Cookie': '_tea_utm_cache_2608=undefined; __tea_cookie_tokens_2608=%257B%2522web_id%2522%253A%25227417859385638143499%2522%252C%2522user_unique_id%2522%253A%25227417859385638143499%2522%252C%2522timestamp%2522%253A1727105001245%257D; csrf_session_id=5eb2f196777d9a821701fab130cc2433',
    'Origin': 'https://juejin.cn',
    'Referer': 'https://juejin.cn/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

params = {
    'user_id': '3701422921094089',
    'cursor': '20',
    'aid': '2608',
    'uuid': '7417859385638143499',
    'spider': '0',
}

response = requests.get(url, params=params, cookies=cookies, headers=headers)

# the response body is JSON text; convert it into Python data types
json_loads = json.loads(response.text)
data_list_ = json_loads['data']['list']

# print one entry, then iterate over the whole activity list
print(data_list_[3])

for data_list in data_list_:
    print(data_list)
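
Two small follow-ups to this example: requests can decode the body for you with response.json(), which is equivalent to json.loads(response.text), and the parsed list can be written back to disk with json.dump. A sketch of both (the output file name juejin_dynamic.json is just a placeholder):

# response.json() is a shortcut for json.loads(response.text)
data_list_ = response.json()['data']['list']

# write the parsed Python objects back out as a JSON file (placeholder file name)
with open('juejin_dynamic.json', 'w', encoding='utf-8') as f:
    json.dump(data_list_, f, ensure_ascii=False, indent=2)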

王者荣耀 (Honor of Kings) example request — target site: gamehelper.gm825.com/wzry/hero/l…

# -*- coding: UTF-8 -*-
# @Project :网络爬虫
# @File :王者荣耀案例请求.py
# @IDE :PyCharm
# @Author :艳烔
# @Date :2024/10/25 15:11


import requests
import json
from retrying import retry


class WZRY:
    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
        }

    # download the hero images:
    # fetch the hero list (names and image links) from the JSON API
    def get_img(self):
        response = requests.get(self.url, headers=self.headers)
        json_loads = json.loads(response.text)
        list_ = json_loads['list']
        return list_

    # fetch the binary content of one image, retrying up to 3 times
    @retry(stop_max_attempt_number=3)
    def get_img_binary(self, url):
        response = requests.get(url)
        assert response.status_code == 200  # a non-200 status raises and triggers a retry
        content = response.content
        return content

    # save each image to disk
    def save_img(self):
        list_ = self.get_img()
        for img in list_:
            name = img['name']
            print(name + '.jpg 正在下载 ' + img['cover'])
            with open(f'E:/Study/code/Python/图片爬取/王者爬取/{name}.jpg', 'wb') as f:
                binary = self.get_img_binary(img['cover'])
                f.write(binary)

    # run the whole download
    def run(self):
        print('开始下载')
        try:
            self.save_img()
        except Exception as e:
            print('下载失败:' + str(e))
        finally:
            print('下载完成')


if __name__ == '__main__':
    url = 'http://gamehelper.gm825.com/wzry/hero/list'
    wzry = WZRY(url)
    wzry.run()
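
A note on the retry pattern above: @retry(stop_max_attempt_number=3) from the retrying library re-runs get_img_binary whenever it raises, up to three attempts in total, and the assert on the status code turns a non-200 response into exactly such an exception, so transient request failures are retried before the error reaches save_img.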