Data Extraction
What is data extraction?
Simply put, data extraction is the process of getting the data we want out of a response.
Data classification
- Unstructured data: HTML
  - Processing methods: regular expressions, XPath (see the sketch after this list)
- Structured data: JSON, XML
  - Processing method: convert to Python's built-in data types
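For the unstructured case, here is a minimal sketch of both approaches on a made-up HTML snippet; the markup, the regex pattern, and the XPath expression are only for illustration, and lxml is a third-party package.

```python
import re
from lxml import etree  # pip install lxml

html_text = '<html><body><h1 class="title">Hello Spider</h1></body></html>'

# Regular expression: match the text inside the <h1> tag
title_re = re.findall(r'<h1.*?>(.*?)</h1>', html_text)
print(title_re)      # ['Hello Spider']

# XPath: parse the document first, then query it
tree = etree.HTML(html_text)
title_xpath = tree.xpath('//h1[@class="title"]/text()')
print(title_xpath)   # ['Hello Spider']
```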
Extracting data from JSON
Because converting JSON data into Python's built-in data types is simple, when writing a crawler we prefer URLs that return JSON data whenever we can find them.
JSON is a lightweight data-interchange format. It is easy for humans to read and write, and easy for machines to parse and generate. It fits data-exchange scenarios such as the traffic between a website's front end and back end.
Notes on using JSON
- Strings in JSON always use double quotes (see the sketch below)
- curl command to Python Requests converter: www.lddgo.net/convert/cur…
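Converting between JSON text and Python's built-in types is done with the standard-library json module. A minimal sketch with a made-up JSON string:

```python
import json

# JSON text: note that keys and string values use double quotes
json_text = '{"name": "艳烔", "posts": 10, "tags": ["python", "spider"]}'

# json.loads: JSON string -> Python built-in types (dict, list, str, int, ...)
data = json.loads(json_text)
print(data['name'], data['tags'][0])

# json.dumps: Python object -> JSON string
# ensure_ascii=False keeps Chinese characters readable instead of \uXXXX escapes
print(json.dumps(data, ensure_ascii=False))
```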
Crawling a Juejin user's activity feed
# -*- coding: UTF-8 -*-
# @Project :网络爬虫
# @File :抓取 艳烔 掘金动态.py
# @IDE :PyCharm
# @Author :艳烔
# @Date :2024/10/25 11:51
# 转换网站: https://www.lddgo.net/convert/curl-to-code
import requests
import json
# Full request URL captured from the browser; the request below rebuilds it from the params dict
url = 'https://api.juejin.cn/user_api/v1/user/dynamic?user_id=3701422921094089&cursor=20&aid=2608&uuid=7417859385638143499&'
cookies = {
    '_tea_utm_cache_2608': 'undefined',
    '__tea_cookie_tokens_2608': '%257B%2522web_id%2522%253A%25227417859385638143499%2522%252C%2522user_unique_id%2522%253A%25227417859385638143499%2522%252C%2522timestamp%2522%253A1727105001245%257D',
    'csrf_session_id': '5eb2f196777d9a821701fab130cc2433',
}
headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/json',
    # Requests sorts cookies= alphabetically
    # 'Cookie': '_tea_utm_cache_2608=undefined; __tea_cookie_tokens_2608=%257B%2522web_id%2522%253A%25227417859385638143499%2522%252C%2522user_unique_id%2522%253A%25227417859385638143499%2522%252C%2522timestamp%2522%253A1727105001245%257D; csrf_session_id=5eb2f196777d9a821701fab130cc2433',
    'Origin': 'https://juejin.cn',
    'Referer': 'https://juejin.cn/',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
params = {
    'user_id': '3701422921094089',
    'cursor': '20',
    'aid': '2608',
    'uuid': '7417859385638143499',
    'spider': '0',
}
response = requests.get('https://api.juejin.cn/user_api/v1/user/dynamic', params=params, cookies=cookies, headers=headers)
# Parse the JSON response into Python built-in types and pull out the activity list
json_loads = json.loads(response.text)
data_list_ = json_loads['data']['list']
# Print one entry, then iterate over every entry in the list
print(data_list_[3])
for data_list in data_list_:
    print(data_list)
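As a side note, requests can decode a JSON body directly via response.json(), which is equivalent to the json.loads(response.text) call above. A short sketch reusing the response object from the code above:

```python
# Equivalent to json.loads(response.text): requests decodes the JSON body itself
data_list_ = response.json()['data']['list']
for item in data_list_:
    print(item)
```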
Honor of Kings (王者荣耀) example request. Request URL: gamehelper.gm825.com/wzry/hero/l…
# -*- coding: UTF-8 -*-
# @Project :网络爬虫
# @File :王者荣耀案例请求.py
# @IDE :PyCharm
# @Author :艳烔
# @Date :2024/10/25 15:11
import requests
import json
from retrying import retry
class WZRY:
    # Download Honor of Kings hero images
    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
        }

    # Get the image names and image URLs from the hero list API
    def get_img(self):
        # headers must be passed by keyword: the second positional argument of requests.get is params
        response = requests.get(self.url, headers=self.headers)
        json_loads = json.loads(response.text)
        list_ = json_loads['list']
        return list_

    # Get the binary content of one image, retrying up to 3 times on failure
    @retry(stop_max_attempt_number=3)
    def get_img_binary(self, url):
        response = requests.get(url)
        assert response.status_code == 200
        content = response.content
        return content
    # Save the images to disk
    def save_img(self):
        list_ = self.get_img()
        for img in list_:
            name = img['name']
            print(name + '.jpg is downloading from ' + img['cover'])
            with open(f'E:/Study/code/Python/图片爬取/王者爬取/{name}.jpg', 'wb') as f:
                binary = self.get_img_binary(img['cover'])
                f.write(binary)

    # Run the whole download
    def run(self):
        print('Download started')
        try:
            self.save_img()
        except Exception as e:
            # str(e) is needed: an exception object cannot be concatenated to a string directly
            print('Download failed: ' + str(e))
        finally:
            print('Download finished')
if __name__ == '__main__':
    url = 'http://gamehelper.gm825.com/wzry/hero/list'
    wzry = WZRY(url)
    wzry.run()
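get_img_binary combines retrying's @retry decorator with an assert: a non-200 status raises AssertionError, which makes retrying re-run the function, up to 3 attempts in total. A minimal, self-contained sketch of that pattern; flaky_download and its URL are made up for illustration:

```python
import requests
from retrying import retry  # pip install retrying

@retry(stop_max_attempt_number=3)  # re-run the function on any exception, at most 3 attempts
def flaky_download(url):
    response = requests.get(url, timeout=10)
    # A failed assertion raises AssertionError, which triggers the next retry attempt
    assert response.status_code == 200
    return response.content

# Hypothetical usage: after 3 failed attempts the last exception propagates to the caller
# content = flaky_download('https://example.com/some-image.jpg')
```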