1. Spider (Web Crawler) Learning Notes
1.0 Crawler Tips
Auto-generate crawler code from a copied request: curlconverter.com/
1.1 Crawl the Baidu homepage and open it in a browser
from urllib.request import urlopen

url = "http://baidu.com"
resp = urlopen(url)
# save the page source so it can be opened in a browser later
with open("baidu.html", mode="w", encoding="utf-8") as f:
    f.write(resp.read().decode("utf-8"))
resp.close()
print("spider baidu over!")
1.2 Crawling Douban data
Learn how to deal with anti-crawling measures; always call response.close() when you are done with a response.
url = "https://movie.douban.com/j/chart/top_list"
param = {
"type": "24",
"interval_id": "100:90",
"action":"",
"start":0,
"limit":20
}
# 处理反爬
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}
response = requests.get(url = url, params = param, headers = headers)
print(response.request.url)
print(response.request.headers)
print(response.json())
response.close()
1.3 Data Parsing
Three ways to parse data:
- re (regex) parsing
- bs4 parsing (see the quick sketch after this list)
- xpath parsing
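A minimal bs4 sketch, assuming pip install beautifulsoup4; the HTML snippet below is made up for illustration:
from bs4 import BeautifulSoup

html = "<div class='gj'><span id='1'>郭靖</span></div><div class='hr'><span id='2'>黄蓉</span></div>"
soup = BeautifulSoup(html, "html.parser")   # parse with the built-in html.parser
for span in soup.find_all("span"):          # find every <span> tag
    print(span.get("id"), span.text)        # read an attribute and the tag text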
1.3.1 Regex parsing
Tool collection: tool.oschina.net/
Online regex tester: tool.oschina.net/regex
1.3.2 Using regex
import re

# findall: find every match and return them as a list
lst = re.findall(r"\d+", "我的电话号是10086,我女朋友的电话是10010")
print(lst)
print("-----------------------------")
# finditer: match everything in the string and return an iterator of match objects
it = re.finditer(r"\d+", "我的电话号是10086,我女朋友的电话是10010")
for i in it:
    print(i.group())
print("-----------------------------")
# search: return the first match as a match object and stop
s = re.search(r"\d+", "我的电话号是10086,我女朋友的电话是10010")
print(s.group())
print("-----------------------------")
# match: only matches from the start of the string
s = re.match(r"\d+", "10086,我女朋友的电话是10010")
print(s.group())
print("-----------------------------")
# precompile a regex so it can be reused later, e.g. pattern.findall(...)
pattern = re.compile(r"\d+")
# (?P<dyw>.*?) is a named group that pulls a sub-part out of the matched content
s = """
<div class = 'gj'><span id = '1'> 郭靖</span></div>
<div class = 'hr'><span id = '2'> 黄蓉</span></div>
<div class = 'dx'><span id = '3'> 东邪</span></div>
<div class = 'xd'><span id = '4'> 西毒</span></div>
<div class = 'nd'><span id = '5'> 南帝</span></div>
<div class = 'bj'><span id = '6'> 北丐</span></div>
"""
# re.S lets . also match newlines
obj = re.compile(r"<div class = '.*?'><span id = '\d+'>(?P<dyw>.*?)</span></div>", re.S)
result = obj.finditer(s)
for it in result:
    print(it.group("dyw"))
print("-----")
1.3.3 Using xpath
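A minimal xpath sketch using lxml (assuming pip install lxml); the HTML snippet mirrors the one in the regex example:
from lxml import etree

html = """
<div class='gj'><span id='1'>郭靖</span></div>
<div class='hr'><span id='2'>黄蓉</span></div>
"""
tree = etree.HTML(html)                   # build an element tree from the HTML string
names = tree.xpath("//div/span/text()")   # text of every <span> under a <div>
ids = tree.xpath("//span/@id")            # the id attribute of every <span>
print(list(zip(ids, names)))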
1.4 Downloading video from Pearvideo past the anti-hotlink check
The Referer header is what the anti-hotlink check looks at; the main work is analyzing the site's various request URLs.
import requests

url = "https://www.pearvideo.com/video_1793545"
contId = url.split("_")[1]
videoStatusUrl = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    # the anti-hotlink check: Referer must point back to the video page itself
    "Referer": url
}
resp = requests.get(videoStatusUrl, headers=headers)
dict_data = resp.json()
resp.close()
src_url = dict_data["videoInfo"]["videos"]["srcUrl"]
systemTime = dict_data["systemTime"]
# the returned srcUrl contains a fake timestamp segment; swap it for cont-{contId} to get the real address
src_url = src_url.replace(systemTime, f"cont-{contId}")
print(src_url)
with open("a.mp4", "wb") as f:
    f.write(requests.get(src_url).content)
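For a large video, streaming the download in chunks avoids holding the whole file in memory; a sketch using requests' stream mode as an alternative to the last two lines above:
# alternative download, not from the original notes: stream the response and write it chunk by chunk
with requests.get(src_url, stream=True) as video_resp:
    with open("a.mp4", "wb") as f:
        for chunk in video_resp.iter_content(chunk_size=1024 * 64):
            f.write(chunk)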
1.5 Crawler Proxies
Free proxy IPs for crawling: www.zdaye.com/free/
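A minimal sketch of sending a request through a proxy with requests; the proxy address below is a placeholder to be replaced with one from the list above:
import requests

# placeholder proxy address; substitute a live ip:port from the free-proxy site
proxies = {
    "http": "http://1.2.3.4:8888",
    "https": "http://1.2.3.4:8888",
}
resp = requests.get("http://www.baidu.com", proxies=proxies, timeout=5)
print(resp.status_code)
resp.close()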
1.6 Crawling a Baidu novel with coroutines
import asyncio
import json
import aiohttp
import requests
import aiofiles

# pip install aiofiles, pip install aiohttp
# download the content of one chapter and write it to a file named after the chapter title
async def aiodownload(cid, b_id, title):
    data = {
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    }
    data = json.dumps(data)
    url = f'http://dushu.baidu.com/api/pc/getChapterContent?data={data}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
            async with aiofiles.open(title, mode='w', encoding="utf-8") as f:
                await f.write(dic['data']['novel']['content'])

async def getCatalog(url):
    # fetch the catalog synchronously, then download every chapter concurrently
    resp = requests.get(url)
    chapter_dic = resp.json()['data']['novel']['items']
    resp.close()
    tasks = []
    for item in chapter_dic:
        cid = item["cid"]
        title = item["title"]
        tasks.append(aiodownload(cid, b_id, title))
    # gather accepts coroutine objects directly (asyncio.wait no longer does)
    await asyncio.gather(*tasks)

if __name__ == '__main__':
    b_id = "4306063500"
    url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + b_id + '"}'
    asyncio.run(getCatalog(url))
Note: the 71st chapter was not retrieved.
1.7 Reverse-engineering crawlers
Quickly generate a Python crawler file from a captured request: curlconverter.com/
1.8 Ticket grabbing on damai.cn with Python and Selenium
1.9 Learning Selenium
The idea: can my program connect to a browser and let the browser do all the complex work, while I only take the final result? Selenium is an automated testing tool: it can open a browser and operate it the way a person would, and the programmer can extract all kinds of information from the page directly through Selenium.
Install with pip install selenium (optionally via the Tsinghua PyPI mirror with -i), then put the unzipped chromedriver browser driver in the same folder as the Python interpreter.
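A minimal sketch of the basic Selenium workflow (open a page, locate an element, read from it); the element locator is only an example:
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

web = Chrome()                       # launches Chrome via the chromedriver found alongside the interpreter
web.get("http://www.baidu.com")      # open a page like a real user would
# locate an element and read one of its attributes; the id "su" is just an illustrative locator
el = web.find_element(By.ID, "su")
print(el.get_attribute("value"))
web.close()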
Workflow for handling captchas with Chaojiying (超级鹰)
Site: www.chaojiying.com/user/login/
Account: siwaxinwu, the usual password, 959596
In the downloaded demo, note which files to extract and which places need changes.
Using Chaojiying to handle the login on Chaojiying's own site
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from chaojiying import Chaojiying_Client

web = Chrome()
web.get("https://www.chaojiying.com/user/login/")
# handle the captcha: screenshot the captcha image and send it to Chaojiying
img = web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/div/img').screenshot_as_png
chaojiying = Chaojiying_Client('siwaxinwu', '', '959596')
dict_value = chaojiying.PostPic(img, 1902)   # 1902 is the captcha type code
verify_code = dict_value['pic_str']
# fill in the username, password, and captcha on the page
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input').send_keys("siwaxinwu")
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input').send_keys("dyw862749167")
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input').send_keys(verify_code)
time.sleep(5)
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input').click()
# keep the browser window open
while True:
    time.sleep(5)
Using Chaojiying for the 12306 login
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from chaojiying import Chaojiying_Client
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options

# if the site detects that the program is an automated testing tool, disable the automation flag
opt = Options()
opt.add_argument('--disable-blink-features=AutomationControlled')
web = Chrome(options=opt)
web.get("https://kyfw.12306.cn/otn/resources/login.html")
time.sleep(2)
# switch to the account-login tab
web.find_element(By.XPATH, '//*[@id="toolbar_Div"]/div[2]/div[2]/ul/li[2]/a').click()
time.sleep(2)
# locate the captcha image (fill in its xpath here) and let Chaojiying recognize it
verify_img_element = web.find_element(By.XPATH, '')
chaojiying = Chaojiying_Client('siwaxinwu', '', '959596')
dict_value = chaojiying.PostPic(verify_img_element.screenshot_as_png, 9004)   # 9004 is the captcha type code
verify_code = dict_value['pic_str']
# the result is a list of coordinates like "x1,y1|x2,y2": click each one inside the image
re_list = verify_code.split("|")
for rs in re_list:
    p_temp = rs.split(",")
    x = int(p_temp[0])
    y = int(p_temp[1])
    # move the mouse to that offset within the captcha image, then click
    ActionChains(web).move_to_element_with_offset(verify_img_element, x, y).click().perform()
# enter the username and password (fill in the xpaths and values here)
time.sleep(5)
web.find_element(By.XPATH, '').send_keys("")
web.find_element(By.XPATH, '').send_keys("")
# handle the drag-to-verify slider (fill in its xpath here)
btn = web.find_element(By.XPATH, '')
ActionChains(web).drag_and_drop_by_offset(btn, 300, 0).perform()
# keep the browser window open
while True:
    time.sleep(5)