1. Spider (Web Crawler) Learning Notes
1.0 Crawler Tips
Auto-generate crawler code from a copied request: curlconverter.com/
1.1 Crawl the Baidu homepage and open it in a browser
from urllib.request import urlopen

url = "http://baidu.com"
resp = urlopen(url)
# save the page source so it can be opened in a browser later
with open("baidu.html", mode="w", encoding="utf-8") as f:
    f.write(resp.read().decode("utf-8"))
resp.close()
print("spider baidu over!")
1.2 Crawling Douban data
Learn how to deal with anti-crawling measures; always call response.close() when you are done with a response.
url = "https://movie.douban.com/j/chart/top_list"
param = {
"type": "24",
"interval_id": "100:90",
"action":"",
"start":0,
"limit":20
}
# 处理反爬
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}
response = requests.get(url = url, params = param, headers = headers)
print(response.request.url)
print(response.request.headers)
print(response.json())
response.close()
1.3 Data Parsing
Three ways to parse data:
- re (regex) parsing
- bs4 parsing (see the quick sketch after this list)
- xpath parsing
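A minimal bs4 sketch, assuming pip install beautifulsoup4; the HTML snippet below is made up for illustration:
from bs4 import BeautifulSoup

html = "<div class='gj'><span id='1'>郭靖</span></div><div class='hr'><span id='2'>黄蓉</span></div>"
soup = BeautifulSoup(html, "html.parser")   # parse with the built-in html.parser
for span in soup.find_all("span"):          # find every <span> tag
    print(span.get("id"), span.text)        # read an attribute and the tag text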
1.3.1 Regex parsing
Tool collection: tool.oschina.net/
Online regex tester: tool.oschina.net/regex
1.3.2 Using regex
import re

# findall: find every match and return them as a list
lst = re.findall(r"\d+", "我的电话号是10086,我女朋友的电话是10010")
print(lst)
print("-----------------------------")
# finditer: match everything in the string and return an iterator of match objects
it = re.finditer(r"\d+", "我的电话号是10086,我女朋友的电话是10010")
for i in it:
    print(i.group())
print("-----------------------------")
# search: return the first match as a match object and stop
s = re.search(r"\d+", "我的电话号是10086,我女朋友的电话是10010")
print(s.group())
print("-----------------------------")
# match: only matches from the start of the string
s = re.match(r"\d+", "10086,我女朋友的电话是10010")
print(s.group())
print("-----------------------------")
# precompile a regex so it can be reused later, e.g. pattern.findall(...)
pattern = re.compile(r"\d+")
# (?P<dyw>.*?) is a named group that pulls a sub-part out of the matched content
s = """
<div class = 'gj'><span id = '1'> 郭靖</span></div>
<div class = 'hr'><span id = '2'> 黄蓉</span></div>
<div class = 'dx'><span id = '3'> 东邪</span></div>
<div class = 'xd'><span id = '4'> 西毒</span></div>
<div class = 'nd'><span id = '5'> 南帝</span></div>
<div class = 'bj'><span id = '6'> 北丐</span></div>
"""
# re.S lets . also match newlines
obj = re.compile(r"<div class = '.*?'><span id = '\d+'>(?P<dyw>.*?)</span></div>", re.S)
result = obj.finditer(s)
for it in result:
    print(it.group("dyw"))
print("-----")
1.3.3 Using xpath
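A minimal xpath sketch using lxml (assuming pip install lxml); the HTML snippet mirrors the one in the regex example:
from lxml import etree

html = """
<div class='gj'><span id='1'>郭靖</span></div>
<div class='hr'><span id='2'>黄蓉</span></div>
"""
tree = etree.HTML(html)                   # build an element tree from the HTML string
names = tree.xpath("//div/span/text()")   # text of every <span> under a <div>
ids = tree.xpath("//span/@id")            # the id attribute of every <span>
print(list(zip(ids, names)))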
1.4 Downloading video from Pearvideo past the anti-hotlink check
The Referer header is what the anti-hotlink check looks at; the main work is analyzing the site's various request URLs.
import requests

url = "https://www.pearvideo.com/video_1793545"
contId = url.split("_")[1]
videoStatusUrl = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    # the anti-hotlink check: Referer must point back to the video page itself
    "Referer": url
}
resp = requests.get(videoStatusUrl, headers=headers)
dict_data = resp.json()
resp.close()
src_url = dict_data["videoInfo"]["videos"]["srcUrl"]
systemTime = dict_data["systemTime"]
# the returned srcUrl contains a fake timestamp segment; swap it for cont-{contId} to get the real address
src_url = src_url.replace(systemTime, f"cont-{contId}")
print(src_url)
with open("a.mp4", "wb") as f:
    f.write(requests.get(src_url).content)
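For a large video, streaming the download in chunks avoids holding the whole file in memory; a sketch using requests' stream mode as an alternative to the last two lines above:
# alternative download, not from the original notes: stream the response and write it chunk by chunk
with requests.get(src_url, stream=True) as video_resp:
    with open("a.mp4", "wb") as f:
        for chunk in video_resp.iter_content(chunk_size=1024 * 64):
            f.write(chunk)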
1.5 Crawler Proxies
Free proxy IPs for crawling: www.zdaye.com/free/
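A minimal sketch of sending a request through a proxy with requests; the proxy address below is a placeholder to be replaced with one from the list above:
import requests

# placeholder proxy address; substitute a live ip:port from the free-proxy site
proxies = {
    "http": "http://1.2.3.4:8888",
    "https": "http://1.2.3.4:8888",
}
resp = requests.get("http://www.baidu.com", proxies=proxies, timeout=5)
print(resp.status_code)
resp.close()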
1.6 Crawling a Baidu novel with coroutines
import asyncio
import json
import aiohttp
import requests
import aiofiles

# pip install aiofiles, pip install aiohttp
# download the content of one chapter and write it to a file named after the chapter title
async def aiodownload(cid, b_id, title):
    data = {
        "book_id": b_id,
        "cid": f"{b_id}|{cid}",
        "need_bookinfo": 1
    }
    data = json.dumps(data)
    url = f'http://dushu.baidu.com/api/pc/getChapterContent?data={data}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
            async with aiofiles.open(title, mode='w', encoding="utf-8") as f:
                await f.write(dic['data']['novel']['content'])

async def getCatalog(url):
    # fetch the catalog synchronously, then download every chapter concurrently
    resp = requests.get(url)
    chapter_dic = resp.json()['data']['novel']['items']
    resp.close()
    tasks = []
    for item in chapter_dic:
        cid = item["cid"]
        title = item["title"]
        tasks.append(aiodownload(cid, b_id, title))
    # gather accepts coroutine objects directly (asyncio.wait no longer does)
    await asyncio.gather(*tasks)

if __name__ == '__main__':
    b_id = "4306063500"
    url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + b_id + '"}'
    asyncio.run(getCatalog(url))
Note: the 71st chapter was not retrieved.
1.7 Reverse-engineering crawlers
Quickly generate a Python crawler file from a captured request: curlconverter.com/
1.8 Ticket grabbing on damai.cn with Python and Selenium
1.9 Learning Selenium
The idea: can my program connect to a browser and let the browser do all the complex work, while I only take the final result? Selenium is an automated testing tool: it can open a browser and operate it the way a person would, and the programmer can extract all kinds of information from the page directly through Selenium.
Install with pip install selenium (optionally via the Tsinghua PyPI mirror with -i), then put the unzipped chromedriver browser driver in the same folder as the Python interpreter.
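A minimal sketch of the basic Selenium workflow (open a page, locate an element, read from it); the element locator is only an example:
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

web = Chrome()                       # launches Chrome via the chromedriver found alongside the interpreter
web.get("http://www.baidu.com")      # open a page like a real user would
# locate an element and read one of its attributes; the id "su" is just an illustrative locator
el = web.find_element(By.ID, "su")
print(el.get_attribute("value"))
web.close()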
Workflow for handling captchas with Chaojiying (超级鹰)
Site: www.chaojiying.com/user/login/
Account: siwaxinwu, the usual password, 959596
In the downloaded demo, note which files to extract and which places need changes.
Using Chaojiying to handle the login on Chaojiying's own site
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from chaojiying import Chaojiying_Client

web = Chrome()
web.get("https://www.chaojiying.com/user/login/")
# handle the captcha: screenshot the captcha image and send it to Chaojiying
img = web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/div/img').screenshot_as_png
chaojiying = Chaojiying_Client('siwaxinwu', '', '959596')
dict_value = chaojiying.PostPic(img, 1902)   # 1902 is the captcha type code
verify_code = dict_value['pic_str']
# fill in the username, password, and captcha on the page
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input').send_keys("siwaxinwu")
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input').send_keys("dyw862749167")
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input').send_keys(verify_code)
time.sleep(5)
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input').click()
# keep the browser window open
while True:
    time.sleep(5)
Using Chaojiying for the 12306 login
import time
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from chaojiying import Chaojiying_Client
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options

# if the site detects that the program is an automated testing tool, disable the automation flag
opt = Options()
opt.add_argument('--disable-blink-features=AutomationControlled')
web = Chrome(options=opt)
web.get("https://kyfw.12306.cn/otn/resources/login.html")
time.sleep(2)
# switch to the account-login tab
web.find_element(By.XPATH, '//*[@id="toolbar_Div"]/div[2]/div[2]/ul/li[2]/a').click()
time.sleep(2)
# locate the captcha image (fill in its xpath here) and let Chaojiying recognize it
verify_img_element = web.find_element(By.XPATH, '')
chaojiying = Chaojiying_Client('siwaxinwu', '', '959596')
dict_value = chaojiying.PostPic(verify_img_element.screenshot_as_png, 9004)   # 9004 is the captcha type code
verify_code = dict_value['pic_str']
# the result is a list of coordinates like "x1,y1|x2,y2": click each one inside the image
re_list = verify_code.split("|")
for rs in re_list:
    p_temp = rs.split(",")
    x = int(p_temp[0])
    y = int(p_temp[1])
    # move the mouse to that offset within the captcha image, then click
    ActionChains(web).move_to_element_with_offset(verify_img_element, x, y).click().perform()
# enter the username and password (fill in the xpaths and values here)
time.sleep(5)
web.find_element(By.XPATH, '').send_keys("")
web.find_element(By.XPATH, '').send_keys("")
# handle the drag-to-verify slider (fill in its xpath here)
btn = web.find_element(By.XPATH, '')
ActionChains(web).drag_and_drop_by_offset(btn, 300, 0).perform()
# keep the browser window open
while True:
    time.sleep(5)