Spider Development Notes

1. Web Crawler Learning Notes

1.0 Crawler Tips

Site that auto-generates crawler code: curlconverter.com/

1.1 Crawl the Baidu page and open it in a browser

from urllib.request import urlopen

url = "http://baidu.com"
resp = urlopen(url)
# write the page out so it can be opened locally in a browser
with open("baidu.html", mode="w", encoding="utf-8") as f:
    f.write(resp.read().decode("utf-8"))

print("spider baidu over!")

1.2 Crawling Douban data

Learn how to deal with anti-scraping measures, and always call response.close() when the crawler is done.

url = "https://movie.douban.com/j/chart/top_list"

param = {
    "type": "24",
    "interval_id": "100:90",
    "action":"",
    "start":0,
    "limit":20
}
# handle anti-scraping: send a real browser User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
}

response = requests.get(url=url, params=param, headers=headers)
print(response.request.url)
print(response.request.headers)
print(response.json())
response.close()

1.3 Data Parsing

Three ways to parse the data:

  • re parsing
  • bs4 parsing (a minimal sketch follows this list)
  • xpath parsing
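
Since bs4 does not get its own subsection later, here is a minimal sketch, assuming beautifulsoup4 is installed (pip install beautifulsoup4) and reusing the kind of HTML shown in 1.3.2:

from bs4 import BeautifulSoup

html = "<div class='gj'><span id='1'>郭靖</span></div><div class='hr'><span id='2'>黄蓉</span></div>"
soup = BeautifulSoup(html, "html.parser")
# find_all collects every matching tag; .get() reads attributes, .text the inner text
for span in soup.find_all("span"):
    print(span.get("id"), span.text)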

1.3.1 Regex parsing

Tool collection: tool.oschina.net/

Online regex tester: tool.oschina.net/regex

1.3.2 Using regex

import re

# findall returns every match as a list
lst = re.findall(r"\d+", "my phone number is 10086, my girlfriend's is 10010")
print(lst)
print("-----------------------------")

# finditer matches everything in the string and returns an iterator of Match objects
it = re.finditer(r"\d+", "my phone number is 10086, my girlfriend's is 10010")
for i in it:
    print(i.group())
print("-----------------------------")

# search returns as soon as it finds one result, as a Match object
s = re.search(r"\d+", "my phone number is 10086, my girlfriend's is 10010")
print(s.group())
print("-----------------------------")
# match anchors the pattern at the start of the string
s = re.match(r"\d+", "10086, my girlfriend's is 10010")
print(s.group())

print("-----------------------------")

# precompile a regex for reuse
pattern = re.compile(r"\d+")

# (?P<dyw>.*?) defines a named group, letting you pull a sub-piece out of each match
s = """
<div class = 'gj'><span id = '1'> 郭靖</span></div>
<div class = 'hr'><span id = '2'> 黄蓉</span></div>
<div class = 'dx'><span id = '3'> 东邪</span></div>
<div class = 'xd'><span id = '4'> 西毒</span></div>
<div class = 'nd'><span id = '5'> 南帝</span></div>
<div class = 'bj'><span id = '6'> 北丐</span></div>
"""
# re.S lets . match newline characters as well
obj = re.compile(r"<div class = '.*?'><span id = '\d+'>(?P<dyw>.*?)</span></div>", re.S)
result = obj.finditer(s)
for it in result:
    print(it.group("dyw"))
    print("-----")

1.3.3 Using XPath

www.bilibili.com/video/BV1i1…
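
A minimal XPath sketch with lxml (pip install lxml), using an HTML snippet like the one in 1.3.2:

from lxml import etree

html = """
<div class='gj'><span id='1'>郭靖</span></div>
<div class='hr'><span id='2'>黄蓉</span></div>
"""
tree = etree.HTML(html)
# //div/span/text() selects the text node inside every <span> under a <div>
names = tree.xpath("//div/span/text()")
print(names)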

1.4 Crawling Pearvideo videos past the anti-hotlink check

The Referer header is what the anti-hotlink check inspects; the main work is analyzing the site's various URLs.

import requests

url = "https://www.pearvideo.com/video_1793545"
contId = url.split("_")[1]

videoStatusUrl = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}"

headers = {
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Referer": url
}

resp = requests.get(videoStatusUrl, headers=headers)
dict_data = resp.json()
src_url = dict_data["videoInfo"]["videos"]["srcUrl"]
systemTime = dict_data["systemTime"]
# srcUrl embeds a timestamp; swapping it for "cont-<contId>" yields the real video URL
src_url = src_url.replace(systemTime, f"cont-{contId}")
print(src_url)

with open("a.mp4", "wb") as f:
    f.write(requests.get(src_url).content)

1.5 Crawler Proxies

Free crawler proxy IPs: www.zdaye.com/free/
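
A minimal sketch of routing requests through a proxy; the address below is a placeholder, not a live proxy:

import requests

proxies = {
    "http": "http://123.45.67.89:8888",   # placeholder; substitute a proxy from zdaye
    "https": "http://123.45.67.89:8888",
}
resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
print(resp.text)  # should report the proxy's IP instead of yours
resp.close()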

1.6 Crawling a Baidu novel with coroutines

import asyncio
import json

import aiohttp
import requests
import aiofiles

# download each chapter's content
# pip install aiofiles, pip install aiohttp 

async def aiodownload(cid, b_id, title):
    data = {
        "book_id":b_id,
        "cid":f"{b_id}|{cid}",
        "need_bookinfo":1
    }
    data = json.dumps(data)  # the API expects the parameters as a JSON string in the URL
    url = f'http://dushu.baidu.com/api/pc/getChapterContent?data={data}'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()
            # write each chapter to its own file, named after its title
            async with aiofiles.open(title, mode='w', encoding="utf-8") as f:
                await f.write(dic['data']['novel']['content'])



async def getCatalog(url):
    resp = requests.get(url)
    chapter_dic = resp.json()['data']['novel']['items']
    tasks = []
    for item in chapter_dic:
        cid = item["cid"]
        title = item["title"]
        tasks.append(aiodownload(cid, b_id, title))
    # asyncio.wait no longer accepts bare coroutines (removed in Python 3.11); gather does
    await asyncio.gather(*tasks)

if __name__ == '__main__':
    b_id = "4306063500"
    url = 'https://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"' + b_id + '"}'
    asyncio.run(getCatalog(url))

Episode 71 not watched yet.

1.7 Reverse-engineering crawlers

Quickly generate a Python crawler file: curlconverter.com/
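
The workflow: in the browser devtools, right-click a request, choose "Copy as cURL", and paste it into curlconverter; it emits a ready-to-run requests snippet roughly shaped like this (the URL and headers are illustrative placeholders):

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 ...',      # copied verbatim from the browser
    'Referer': 'https://example.com/',
}
response = requests.get('https://example.com/api/data', headers=headers)
print(response.status_code)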

1.8 Grabbing Damai tickets with Python and Selenium

juejin.cn/user/715143…

1.9 Learning Selenium

Can my program drive a browser, letting the browser carry out all the complicated operations while I only consume the final result? That is what Selenium offers. Selenium is an automated testing tool: it can open a browser and operate it the way a human would, and the programmer can extract all kinds of information from the page directly through Selenium.

pip install selenium -i (Tsinghua mirror); then put the unzipped browser driver chromedriver in the same folder as the Python interpreter.
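
A minimal Selenium sketch, assuming Chrome and a matching chromedriver are installed:

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

web = Chrome()                    # opens a real Chrome window
web.get("https://www.baidu.com")
print(web.title)                  # title of the rendered page
# elements are located with By selectors, the same way as in the login examples below
first_link = web.find_element(By.TAG_NAME, "a")
print(first_link.text)
web.quit()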

Chaojiying captcha-handling workflow

Site: www.chaojiying.com/user/login/

Account: siwaxinwu, the usual password, soft ID 959596

Files to extract from the downloaded demo, and the places that need changes.

Using Chaojiying to handle Chaojiying's own login

import time

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from chaojiying import Chaojiying_Client

web = Chrome()
web.get("https://www.chaojiying.com/user/login/")
# handle the captcha: screenshot the captcha image and send it to Chaojiying
img = web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/div/img').screenshot_as_png
chaojiying = Chaojiying_Client('siwaxinwu', '', '959596')  # (username, password, soft_id)
dict_value = chaojiying.PostPic(img, 1902)  # 1902 is the Chaojiying captcha-type code
verify_code = dict_value['pic_str']

# fill the username, password and captcha into the page
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input').send_keys("siwaxinwu")
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input').send_keys("dyw862749167")
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input').send_keys(verify_code)
time.sleep(5)
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input').click()

# keep the browser window from closing
while True:
    time.sleep(5)

Using Chaojiying for the 12306 login

import time

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from chaojiying import Chaojiying_Client
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
# in case the program is detected as an automation tool, disable Blink's automation flag
opt = Options()
opt.add_argument('--disable-blink-features=AutomationControlled')
web = Chrome(options=opt)
web.get("https://kyfw.12306.cn/otn/resources/login.html")
time.sleep(2)
# switch to the account-login tab
web.find_element(By.XPATH, '//*[@id="toolbar_Div"]/div[2]/div[2]/ul/li[2]/a').click()
time.sleep(2)
# grab the captcha image for Chaojiying to recognize (the image's XPath still needs filling in)
verify_img_element = web.find_element(By.XPATH, '')
chaojiying = Chaojiying_Client('siwaxinwu', '', '959596')  # (username, password, soft_id)
dict_value = chaojiying.PostPic(verify_img_element.screenshot_as_png, 9004)  # 9004: coordinate captcha
verify_code = dict_value['pic_str']
re_list = verify_code.split("|")  # coordinates come back as "x1,y1|x2,y2|..."
for rs in re_list:
    p_temp = rs.split(",")
    x = int(p_temp[0])
    y = int(p_temp[1])
    # move the mouse to each returned coordinate and click
    # note: Selenium 4 measures these offsets from the element's center rather than
    # its top-left corner, so Chaojiying's coordinates may need adjusting
    ActionChains(web).move_to_element_with_offset(verify_img_element, x, y).click().perform()
# enter the username and password
time.sleep(5)
# fill in the XPaths of the username and password inputs, plus the credentials
web.find_element(By.XPATH, '').send_keys("")
web.find_element(By.XPATH, '').send_keys("")
# solve the drag slider: locate it (XPath still needs filling in) and drag it right
btn = web.find_element(By.XPATH, '')
ActionChains(web).drag_and_drop_by_offset(btn, 300, 0).perform()

# keep the browser window from closing
while True:
    time.sleep(5)