1. Introduction to Crawlers
1.1 Crawler Types
- General-purpose crawler (a minimal sketch of this loop follows the list)
  - Initialize a batch of seed URLs.
  - Crawl starting from those URLs: fetch the corresponding HTML, extract the outbound links from it, and add each crawled URL to a "crawled" queue.
  - Analyze the extracted outbound links and repeat step 2 in an endless loop.
- Focused crawler
  - The crawl flow is similar to a general-purpose crawler, but a focused crawler only targets a specific domain or topic.
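This loop is easy to express in code. Below is a minimal sketch of a general-purpose crawl loop, assuming a plain breadth-first queue and a crude regex for link extraction; the seed URL, page limit, and regex are illustrative choices, not from the original.

```python
import re
from collections import deque
from urllib.request import urlopen

def crawl(seed_urls, max_pages=10):
    """Minimal breadth-first crawl loop: fetch, extract links, repeat."""
    pending = deque(seed_urls)   # URLs waiting to be fetched
    visited = set()              # URLs already fetched
    while pending and len(visited) < max_pages:
        url = pending.popleft()
        if url in visited:
            continue
        try:
            html = urlopen(url, timeout=5).read().decode("utf-8", "ignore")
        except Exception as e:
            print("fetch failed:", url, e)
            continue
        visited.add(url)
        # Very rough link extraction; a real crawler would use an HTML parser
        for link in re.findall(r'href="(https?://[^"]+)"', html):
            if link not in visited:
                pending.append(link)
    return visited

# crawl(["https://example.com"])  # example seed, for illustration only
```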
1.2 The Robots Protocol
Most websites provide a robots.txt file that tells crawler engines what may and may not be crawled. For example, an excerpt of Baidu's robots.txt (a programmatic check follows the listing):
User-agent: Baiduspider   # which crawler the following rules apply to
Disallow: /               # disallow crawling everything
User-agent: baiduspider
Disallow: /
User-agent: Baiduspider
Disallow: /baidu
Disallow: /s?
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: Googlebot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?
Disallow: /home/news/data/
Disallow: /bh
User-agent: *
Disallow: /
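The same rules can be checked programmatically with the standard library's urllib.robotparser (covered again in section 2.1). A small sketch, using Baidu's robots.txt as the target; the expected results in the comments follow the excerpt above:

```python
from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("https://www.baidu.com/robots.txt")
rp.read()

# can_fetch(useragent, url) answers whether the rules allow that agent to fetch the URL
print(rp.can_fetch("Baiduspider", "https://www.baidu.com/baidu"))  # expected False: "Disallow: /baidu"
print(rp.can_fetch("SomeOtherBot", "https://www.baidu.com/"))      # expected False: "User-agent: *  Disallow: /"
```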
2. Crawler Usage
2.1 urllib
urllib is a standard library shipped with Python. It mainly contains the following modules:
- urllib.request opens and reads URLs
- urllib.error contains the exceptions raised by urllib.request (a short handling sketch follows this list)
- urllib.parse parses URLs
- urllib.robotparser parses robots.txt files
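The examples below catch a bare Exception; urllib.error additionally lets you distinguish HTTP status errors from connection failures. A minimal sketch, with an arbitrary test URL:

```python
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

try:
    with urlopen("https://httpbin.org/status/404", timeout=5) as res:  # placeholder URL that returns 404
        print(res.read())
except HTTPError as e:
    # The server answered, but with a 4xx/5xx status code
    print("HTTP error:", e.code, e.reason)
except URLError as e:
    # DNS failure, refused connection, timeout, ...
    print("connection error:", e.reason)
```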
2.2 Example
from urllib.request import urlopen, Request
from urllib import parse

# Build the search URL with an encoded query string
url = "http://cn.bing.com/search?{}".format(parse.urlencode({"q": "哈哈"}))
print(url)

req = Request(url)
req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36")

# Fetch the page and save the raw HTML to a local file
with urlopen(req) as res:
    with open("D:/haha.html", "wb+") as f:
        f.write(res.read())
        f.flush()

# Decode a percent-encoded string back to the original text
s = parse.unquote("%E5%93%88%E5%93%88")
print(s)
from urllib.request import urlopen, Request
from urllib import parse
import simplejson
from pprint import pprint
import ssl


def public(url, parmter, ua="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"):
    """
    :param url: the URL to crawl
    :param parmter: query parameters
    :param ua: User-Agent
    :return: the crawled data
    """
    # Encode the parameters and wrap everything in a Request object
    dealParamter = parse.urlencode(parmter)
    dealUrl = "%s?%s" % (url, dealParamter)
    req = Request(dealUrl, headers={
        "User-Agent": ua
    })
    # Skip verification for untrusted certificates
    context = ssl._create_unverified_context()
    try:
        with urlopen(req, context=context) as res:
            data = res.read()
            print("获取数据", data)
            pprint(simplejson.loads(data))
    except Exception as e:
        print(e)
        print("爬虫出错=========")


# Fetch Douban's movie API as JSON
url = "https://movie.douban.com/j/search_subjects"
parameter = {"type": "movie", "page_limit": "50", "page_start": "0", "tag": "热门"}
public(url, parameter)
2.3 requests
requests builds on top of urllib3 and provides a far more convenient API than urllib. It needs to be installed first:
pip install requests
import requests

urls = ["https://www.baidu.com/s?wd=magedu", "https://www.baidu.com/s?wd=magedu"]
session = requests.Session()

# Reuse one session (and its cookies) across several requests
with session as s:
    for url in urls:
        with s.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}) as res:
            print(res.cookies)
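requests also removes the need to build query strings or parse JSON by hand, which the urllib example did manually: pass a params dict and call Response.json(). A small sketch reusing the Douban endpoint from section 2.2; the response field names (subjects, title, rate) are assumptions about that API's typical output, not taken from the original.

```python
import requests

url = "https://movie.douban.com/j/search_subjects"
params = {"type": "movie", "page_limit": 10, "page_start": 0, "tag": "热门"}
headers = {"User-Agent": "Mozilla/5.0"}

res = requests.get(url, params=params, headers=headers, timeout=5)
res.raise_for_status()          # raise if the status code is 4xx/5xx
data = res.json()               # parse the JSON body directly
for subject in data.get("subjects", []):   # assumed response structure
    print(subject.get("title"), subject.get("rate"))
```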
2.4 HTML Parsing
2.4.1 XPath
www.w3school.com.cn/xpath/index…
Using XPath in Python relies on lxml, which must be installed first:
pip install lxml
Parse Douban's weekly movie word-of-mouth chart:
import requests
from lxml import etree

session = requests.Session()
with session:
    with session.get("https://movie.douban.com", headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}) as res:
        content = res.text
        # Parse the HTML and pull the chart titles out of the #billboard section
        node = etree.HTML(content)
        titles = node.xpath("//*[@id='billboard']//tr//a/text()")
        for title in titles:
            print(title)
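XPath can also pull attributes, not just text nodes. The sketch below extends the example to print each title together with its link; it reuses the same billboard selector and simply reads the href attribute of each <a> element.

```python
import requests
from lxml import etree

headers = {"User-Agent": "Mozilla/5.0"}
res = requests.get("https://movie.douban.com", headers=headers, timeout=5)
node = etree.HTML(res.text)

# Pair each <a> element's text with its href attribute
for a in node.xpath("//*[@id='billboard']//tr//a"):
    print(a.text, a.get("href"))
```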
2.5 A Practical Example
Fetch the view counts of the articles you published under Alibaba Cloud's 乘风者计划 program.
import requests, json, sys
import pandas as pd
import numpy as np


def use_pandas(data):
    """
    Print the results as a pandas table.
    """
    df = pd.DataFrame(data)
    df = df.sort_values("pv")
    # Build a summary row; the first six columns are irrelevant for the total
    sum_line = df.sum()
    sum_line[:6] = np.nan
    sum_line["createTime"] = "总计"
    # Note: DataFrame.append is deprecated in newer pandas versions; pd.concat is the modern replacement
    df = df.append(sum_line, ignore_index=True).drop(['articleId', 'userId', 'uccId', 'showCreateTime', 'link'], axis=1).rename(columns={"title": "文章标题", "pv": "浏览量", "uv": "有效浏览量(ip)"})
    print(df)


def use_default(data):
    """
    Print the results with a plain loop.
    """
    rows = [{"pv": item['pv'], "uv": item['uv'], "title": item['title'], "status": item['status']['desc'], "time": item['createTime']} for item in data]
    rows.sort(key=lambda x: x['pv'], reverse=True)
    print_line(rows)


def print_line(data):
    """
    Output helper for use_default.
    """
    sum_pv = 0
    sum_uv = 0
    for i in data:
        print(i["title"], i["pv"], i["uv"])
        sum_pv += i["pv"]
        sum_uv += i["uv"]
    print("总计文章%d篇,总浏览量%d,实际浏览量(ip)%d" % (len(data), sum_pv, sum_uv))


def init_request():
    """
    Request the article list.
    """
    # API endpoint
    url = "https://developer.aliyun.com/developer/api/my/article/listUserArticles"
    # Fetch 50 articles; raise pageSize if you have more than 50
    params = {"pageNum": 1, "pageSize": 50}
    # Open the browser dev tools (F12), pick any request and copy its cookie header
    headers = {"cookie": "cna=96urFwjx+GACASeilqd3n/R7; cps=zwYSodrSlqyGyCJI7u2gn2ebFjmv52Gp4fP3E0OUkWygO%2FVyx1gTM4B57P5xTQom; currentRegionId=cn-hangzhou; _ga=GA1.2.1090769455.1612852060; console_base_assets_version=3.28.2; aliyun_choice=CN; aliyun_lang=zh; UM_distinctid=17b28b596d46df-0f4132227e4ea4-4343363-1fa400-17b28b596d5e34; channel=d2Q0xQvU5P2I5QdQar%2FkoB8bJR34LNld96ejYso1kCk%3D; JSESSIONID=RFYJHYBV-ONVS8DOF3ZBET4ALSGTL3-8TBG37TK-MH324; maliyun_temporary_console0=1AbLByOMHeZe3G41KYd5Wf4QFzvM7jI4RWVTNJhwa5Fea5dBXFGA85maycwyW%2BBAe3JYA0hfH8R3v%2B0AolOGJxINHpbO4%2FoOQkxpozaqoVyfO7S4oYXQff4W1KUWIjI%2FaslfEhUXrEwbqP4tWO7nmQ%3D%3D; UC-XSRF-TOKEN=378a8120-c945-4225-8017-d7dd891cd8c6; _samesite_flag_=true; login_aliyunid_pk=1962712035129897; _hvn_login=6; login_aliyunid_ticket=DijB2uUHCKO2bsU9bgxuVgFRIxBof_BNpwU_TOTNChZBoeM1KJexdfb9zhYnsN5Zos6qISCrRt7mGxbigG2Cd4fWaCmBZHIzsgdZq64XXWQgyKFeuf0vpmV*s*CT58JlM_1t$w34x$TvGqKtPSlSDe4K0; login_aliyunid_luid=BG+D28Yw1O8deba817fa654d72d6835d63603e9a026+5hIJNaSTxH9ttNLvo0LUTIEzPEjtV6DsbDnZhP5w; login_aliyunid_csrf=_csrf_tk_1993530933875803; hssid=1HRiWsRr_mhsBO1tNMMnmZA1; hsite=6; aliyun_country=CN; aliyun_site=CN; login_aliyunid_abi=BG+2fCS7Siyf89dc556c87994764e280122632f5c3a+wMnKd+P+Pjmc22svOtXOd58srkX/luYMq6+Kzfr1cQpDFslOW3I=; login_aliyunid=%E5%BE%B7%E7%8E%9B%E8%A5%BF%E4%BA%9A%E6%89%80%E5%90%91%E6%8A%AB%E9%9D%A1; tfstk=cC4PBdXoeaQrru0CX4gFPZvnLlS5ZXFutElsqoLKYhV5mYoli7pKiHcsgXMOo0f..; l=eBP_xQDuOae7eMcUKOfwourza77OSIRAguPzaNbMiOCPOUXJ5e9GW63rVEtvCnGVh68kJ3SW3fIbBeYBqCYuqlAScqNvAHMmn; isg=BNfX4CVOehhmXMCFrWUKDa15ZkshHKt-XYfIfCkE86YNWPeaMOw7zpV2vvjGsIP2"}
    session = requests.Session()
    # Send the request
    with session as si:
        r = si.get(url, params=params, headers=headers)
        r.encoding = "utf-8"
        text = json.loads(r.text)
        if text.get("code") == "40001":
            print(r.text)
            print("cookie失效,请重新抓包")
            sys.exit(0)
        s = text.get("data").get("list")
        return s


if __name__ == '__main__':
    s = init_request()
    # plain-loop output
    use_default(s)
    # pandas output
    use_pandas(s)
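If you want to keep the results rather than just print them, the same data can be written to a file with pandas. A small sketch, assuming the column names used in use_pandas above; the output path and helper name are arbitrary examples.

```python
import pandas as pd

def save_csv(data, path="articles.csv"):
    """Persist the raw article list to CSV; 'path' is just an example name."""
    df = pd.DataFrame(data)
    # Drop the same columns that use_pandas drops before displaying
    df = df.drop(['articleId', 'userId', 'uccId', 'showCreateTime', 'link'],
                 axis=1, errors='ignore')
    df.to_csv(path, index=False, encoding="utf-8-sig")  # utf-8-sig so Excel renders Chinese titles correctly

# save_csv(init_request())
```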