Python爬虫入门

228 阅读4分钟

「这是我参与2022首次更文挑战的第18天,活动详情查看:2022首次更文挑战」。

1. 爬虫简介

1.1爬虫分类

  1. 通用爬虫
    1. 初始化一批url
    2. 从这些url开始爬取,获取到对应的html,再从这些html中获取到外链,把已爬取的url放入到已爬取队列
    3. 分析获取到的外链,无限循环执行第二步
  1. 聚焦爬虫
    爬取流程与通用爬虫相似,但是聚焦爬虫针对于特定领域

1.2 Robots协议

大部分网站都有一个robots.txt文件告诉爬虫引擎什么可以爬取

类似www.taobao.com/robots.txt

User-agent: Baiduspider #表示哪个引擎
Disallow: / #不允许爬取所有内容

User-agent: baiduspider
Disallow: /

www.baidu.com/robots.txt

User-agent: Baiduspider
Disallow: /baidu
Disallow: /s?
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh

User-agent: Googlebot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh


User-agent: *
Disallow: /

2.爬虫使用

2.1 urllib

是python提供的标准库 主要包含以下模块

  1. urllib.request 打开和读写url
  1. urllib.error 包含了由request引起的异常
  1. urllib.parse 用于解析url
  1. urllib.robotparser 用于分析robots.txt 文件

2.2. 例子

from urllib.request import urlopen, Request
from urllib import parse

# Build a Bing search URL with a percent-encoded query string.
url = "http://cn.bing.com/search?{}".format(parse.urlencode({"q": "哈哈"}))
print(url)

# Spoof a desktop-browser User-Agent so the site serves the normal page.
req = Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36")

# Download the response and save it to disk.  The context managers close
# both the HTTP response and the file; closing the file also flushes it,
# so the explicit flush() was redundant.  "wb" suffices: we never read back.
with urlopen(req) as res:
    with open("D:/haha.html", "wb") as f:
        f.write(res.read())

# Decode a percent-encoded UTF-8 string back to text.
s = parse.unquote("%E5%93%88%E5%93%88")
print(s)
from urllib.request import urlopen,Request
from urllib import parse
import simplejson
from pprint import pprint
import ssl

def public(url, parmter,ua="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"):
    """
    Fetch *url* with query parameters and pretty-print the JSON response.

    :param url: base URL to crawl (no query string)
    :param parmter: dict of query-string parameters
    :param ua: User-Agent header value (defaults to a desktop Chrome UA)
    :return: None; the decoded JSON is printed, failures are printed and swallowed
    """
    # Encode the parameters and build the full request URL.
    query = parse.urlencode(parmter)
    full_url = "%s?%s" % (url, query)
    req = Request(full_url, headers={"User-Agent": ua})

    # Skip TLS certificate verification.  NOTE(review): this disables all
    # certificate checks — acceptable for a demo, unsafe in production.
    context = ssl._create_unverified_context()

    try:
        # BUG FIX: urlopen()'s second positional argument is `data`, which
        # would have sent the SSLContext as a POST body.  The context must
        # be passed as the keyword argument `context=`.
        with urlopen(req, context=context) as res:
            data = res.read()
            print("获取数据", data)
            pprint(simplejson.loads(data))
    except Exception as e:
        # Best-effort demo: report the failure instead of crashing.
        print(e)
        print("爬虫出错=========")


# Crawl the Douban "hot movies" JSON endpoint with the helper above.
url = "https://movie.douban.com/j/search_subjects"
# First 50 hot movies, starting at offset 0.
parameter = dict(type="movie", page_limit="50", page_start="0", tag="热门")
public(url, parameter)

2.3 requests

对urllib和urllib3进行了一些封装,提供了更强大的api

需要先安装

pip install requests
import requests

# The two URLs are identical on purpose: the second request demonstrates
# that the Session reuses cookies stored from the first response.
urls = ["https://www.baidu.com/s?wd=magedu","https://www.baidu.com/s?wd=magedu"]
# Hoisted so the UA literal isn't repeated per request (was also misspelled
# "seeesion" below — renamed to "session").
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}

session = requests.Session()
with session as s:
    for url in urls:
        with s.get(url, headers=HEADERS) as res:
            print(res.cookies)

2.4 html解析

2.4.1 xPath

www.w3school.com.cn/xpath/index…

python使用xpath 依赖 lxml,需要安装

pip install lxml

解析豆瓣一周电影口碑榜

import requests
from lxml import etree

# Scrape the Douban homepage and print the titles in the
# "one-week movie billboard" (#billboard) table.
session = requests.Session()
with session:
    with session.get("https://movie.douban.com",headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}) as res:
        content = res.text
        node = etree.HTML(content)
        # XPath: the text of every <a> inside the billboard's table rows.
        # Renamed from `list` so the builtin isn't shadowed.
        titles = node.xpath("//*[@id='billboard']//tr//a/text()")
        for title in titles:
            print(title)

2.5 实战

获取阿里云乘风者计划文章浏览数

import requests,json,sys
import pandas as pd
import numpy as np

def use_pandas(data):
    """
        使用pandas格式输出
    """
    df = pd.DataFrame(data)
    df = df.sort_values("pv")
    sum_line = df.sum()
    sum_line[:6] = np.nan
    sum_line["createTime"] = "总计"
    df = df.append(sum_line,ignore_index=True).drop(['articleId','userId','uccId','showCreateTime','link'],axis=1).rename(columns={"title":"文章标题","pv":"浏览量","uv":"有效浏览量(ip)"})
    print(df)
    
def use_default(data):
    """
    Print article stats with plain loops (no pandas), sorted by page
    views in descending order.

    :param data: list of article dicts from the API response
    """
    # Project only the displayed fields; renamed from `list` so the
    # builtin isn't shadowed.
    rows = [{"pv": item['pv'], "uv": item['uv'], "title": item['title'],
             "status": item['status']['desc'], "time": item['createTime']}
            for item in data]
    rows.sort(key=lambda x: x['pv'], reverse=True)
    print_line(rows)
    
def print_line(data):
    """
    Print one line per article (title, pv, uv) plus a grand-total line.

    Fix: the original docstring lines were indented with tabs while the
    body used spaces, which raises TabError under Python 3.

    :param data: list of dicts with at least "title", "pv" and "uv" keys
    """
    sum_pv = 0
    sum_uv = 0
    for i in data:
        print(i["title"], i["pv"], i["uv"])
        sum_pv += i["pv"]
        sum_uv += i["uv"]
    print("总计文章%d篇,总浏览量%d,实际浏览量(ip)%d" %(len(data),sum_pv,sum_uv))

def init_request():
    """
    Request the current user's article list from the Aliyun developer API.

    Fix: the original docstring lines were indented with tabs while the
    body used spaces, which raises TabError under Python 3.

    :return: the "data.list" payload (list of article dicts)
    :raises SystemExit: via sys.exit(0) when the cookie has expired (code 40001)
    """
    # API endpoint for the logged-in user's articles.
    url = "https://developer.aliyun.com/developer/api/my/article/listUserArticles"
    # Fetch up to 50 articles; raise pageSize if you have more.
    params = {"pageNum":1,"pageSize":50}
    # Auth is cookie-based: capture a fresh cookie from the browser (F12)
    # when it expires.
    headers = {"cookie":"cna=96urFwjx+GACASeilqd3n/R7; cps=zwYSodrSlqyGyCJI7u2gn2ebFjmv52Gp4fP3E0OUkWygO%2FVyx1gTM4B57P5xTQom; currentRegionId=cn-hangzhou; _ga=GA1.2.1090769455.1612852060; console_base_assets_version=3.28.2; aliyun_choice=CN; aliyun_lang=zh; UM_distinctid=17b28b596d46df-0f4132227e4ea4-4343363-1fa400-17b28b596d5e34; channel=d2Q0xQvU5P2I5QdQar%2FkoB8bJR34LNld96ejYso1kCk%3D; JSESSIONID=RFYJHYBV-ONVS8DOF3ZBET4ALSGTL3-8TBG37TK-MH324; maliyun_temporary_console0=1AbLByOMHeZe3G41KYd5Wf4QFzvM7jI4RWVTNJhwa5Fea5dBXFGA85maycwyW%2BBAe3JYA0hfH8R3v%2B0AolOGJxINHpbO4%2FoOQkxpozaqoVyfO7S4oYXQff4W1KUWIjI%2FaslfEhUXrEwbqP4tWO7nmQ%3D%3D; UC-XSRF-TOKEN=378a8120-c945-4225-8017-d7dd891cd8c6; _samesite_flag_=true; login_aliyunid_pk=1962712035129897; _hvn_login=6; login_aliyunid_ticket=DijB2uUHCKO2bsU9bgxuVgFRIxBof_BNpwU_TOTNChZBoeM1KJexdfb9zhYnsN5Zos6qISCrRt7mGxbigG2Cd4fWaCmBZHIzsgdZq64XXWQgyKFeuf0vpmV*s*CT58JlM_1t$w34x$TvGqKtPSlSDe4K0; login_aliyunid_luid=BG+D28Yw1O8deba817fa654d72d6835d63603e9a026+5hIJNaSTxH9ttNLvo0LUTIEzPEjtV6DsbDnZhP5w; login_aliyunid_csrf=_csrf_tk_1993530933875803; hssid=1HRiWsRr_mhsBO1tNMMnmZA1; hsite=6; aliyun_country=CN; aliyun_site=CN; login_aliyunid_abi=BG+2fCS7Siyf89dc556c87994764e280122632f5c3a+wMnKd+P+Pjmc22svOtXOd58srkX/luYMq6+Kzfr1cQpDFslOW3I=; login_aliyunid=%E5%BE%B7%E7%8E%9B%E8%A5%BF%E4%BA%9A%E6%89%80%E5%90%91%E6%8A%AB%E9%9D%A1; tfstk=cC4PBdXoeaQrru0CX4gFPZvnLlS5ZXFutElsqoLKYhV5mYoli7pKiHcsgXMOo0f..; l=eBP_xQDuOae7eMcUKOfwourza77OSIRAguPzaNbMiOCPOUXJ5e9GW63rVEtvCnGVh68kJ3SW3fIbBeYBqCYuqlAScqNvAHMmn; isg=BNfX4CVOehhmXMCFrWUKDa15ZkshHKt-XYfIfCkE86YNWPeaMOw7zpV2vvjGsIP2"}
    session = requests.Session()
    # The `with` block closes the session after the single request; the
    # Response object `r` remains usable afterwards.
    with session as si:
        r = si.get(url, params=params, headers=headers)
        r.encoding = "utf-8"
    text = json.loads(r.text)
    # "40001" is the API's cookie-expired error code.
    if text.get("code") == "40001":
        print(r.text)
        print("cookie失效,请重新抓包")
        sys.exit(0)
    s = text.get("data").get("list")
    return s

if __name__ == '__main__':
    # Fetch the article list once, then render it both ways.
    articles = init_request()
    use_default(articles)   # plain-loop report
    use_pandas(articles)    # pandas table report