Crawler 11 - Breaking Through Anti-Crawler Strategies with Scrapy


To change the IP address and User-Agent that Scrapy sends, you need to modify the crawler's middlewares. The two kinds of middleware Scrapy commonly uses are downloader middlewares and spider middlewares.
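For orientation, a downloader middleware only needs to implement a hook that Scrapy calls for every outgoing request. A minimal sketch (the class name is a placeholder, not part of this project):

```python
class ExampleDownloaderMiddleware(object):
    # Hypothetical skeleton: Scrapy calls this once for every outgoing request.
    def process_request(self, request, spider):
        # Mutate the request here, e.g. request.headers or request.meta['proxy'].
        return None  # None means "continue through the remaining middlewares"
```

The middlewares below fill in this hook to rotate the User-Agent, attach a proxy, and render dynamic pages.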

1. User-Agent

(1) Installing fake-useragent: pip install fake-useragent

Usage:

```python
from fake_useragent import UserAgent

ua = UserAgent()
print(ua.ie)
```

Output:

```
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; InfoPath.2)
```

The above is the usage for IE; the other browsers work the same way:

```python
# Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US);
ua.msie
# Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)
ua['Internet Explorer']
# Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; GTB7.4; InfoPath.2; SV1; .NET CLR 3.3.69573; WOW64; en-US)
ua.opera
# Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11
ua.chrome
# Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2
ua.google
# Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13
ua['google chrome']
# Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11
ua.firefox
# Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1
ua.ff
# Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1
ua.safari
# Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25
# and the best one, random via real world browser usage statistic
ua.random
ua.update()
```

(2) To set the User-Agent, modify the project's middlewares.py and add the following code:

```python
from fake_useragent import UserAgent


class UserAgentMiddleware(object):
    def __init__(self, user_agent=''):
        # print('==UserAgentMiddleware init==')
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # print('==UserAgentMiddleware process_request==')
        if self.ua:
            # print('********Current UserAgent************')
            user_agent = self.ua.random   # pick one random UA and reuse it below
            print(user_agent)
            request.headers.setdefault('User-Agent', user_agent)
```

Then modify DOWNLOADER_MIDDLEWARES in the settings.py configuration file:

```python
# The original setting
# DOWNLOADER_MIDDLEWARES = {
#     'xie.middlewares.XieSpiderMiddleware': 543,
# }
DOWNLOADER_MIDDLEWARES = {
    # 'xie.middlewares.XieSpiderMiddleware': 543,
    'xie.middlewares.UserAgentMiddleware': 200,
}
```

The code above is adapted from the user agent middleware that ships with Scrapy; see the source at site-packages/scrapy/downloadermiddlewares/useragent.py.
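Because that stock middleware also sets a User-Agent, it is usually worth switching it off so it cannot interfere with the random one. The extra entry below is my assumption, not something the original settings show:

```python
DOWNLOADER_MIDDLEWARES = {
    'xie.middlewares.UserAgentMiddleware': 200,
    # Assumed extra step: disable Scrapy's built-in user agent middleware.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
```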

2. Proxy IP

Setting up proxy IPs is much like setting the User-Agent: define a proxy IP middleware in middlewares.py:

```python
import base64
import random


class RandomProxyMiddleware(object):
    def __init__(self):
        self.PROXIES = [
            {'ip_port': '111.8.60.9:8123', 'user_passwd': 'user1:pass1'},
            {'ip_port': '101.71.27.120:80', 'user_passwd': 'user2:pass2'},
            {'ip_port': '122.96.59.104:80', 'user_passwd': 'user3:pass3'},
            {'ip_port': '122.224.249.122:8088', 'user_passwd': 'user4:pass4'},
        ]

    def process_request(self, request, spider):
        proxy = random.choice(self.PROXIES)
        if proxy['user_passwd'] is None:
            # Proxy without authentication
            request.meta['proxy'] = 'http://' + proxy['ip_port']
        else:
            # Base64-encode the username:password pair for HTTP Basic proxy auth
            base64_userpasswd = base64.b64encode(
                proxy['user_passwd'].encode()).decode()
            request.headers['Proxy-Authorization'] = 'Basic ' + base64_userpasswd
            request.meta['proxy'] = 'http://' + proxy['ip_port']
```

Then register it in DOWNLOADER_MIDDLEWARES in the settings.py configuration file:

```python
DOWNLOADER_MIDDLEWARES = {
    # 'xie.middlewares.XieSpiderMiddleware': 543,
    'xie.middlewares.UserAgentMiddleware': 200,
    'xie.middlewares.RandomProxyMiddleware': 300,
}
```
  
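With both middlewares enabled, a quick way to check that they actually take effect is a throwaway spider pointed at an echo service. The spider name and the httpbin.org URL here are assumptions for illustration, not part of the original project:

```python
import scrapy


class CheckSpider(scrapy.Spider):
    # Hypothetical spider used only to verify the middlewares.
    name = 'check'
    start_urls = ['http://httpbin.org/get']

    def parse(self, response):
        # httpbin echoes the request headers and the origin IP, so the random
        # User-Agent and the proxy address should both show up in this output.
        self.logger.info(response.text)
```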
  

3. Fetching dynamic pages

For pages rendered by JavaScript, a downloader middleware can load the page with Selenium and hand the rendered HTML back to Scrapy as an HtmlResponse:

```python
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class WebDriverMiddleware(object):
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Start the browser driver
        print('================process_request================')
        browser = webdriver.PhantomJS()
        browser.get(request.url)     # load the page
        data = browser.page_source   # grab the rendered HTML
        data = data.encode('utf-8')
        browser.quit()
        # Returning an HtmlResponse here skips the normal download for this request
        return HtmlResponse(request.url, body=data, encoding='utf-8', request=request)

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
```
  

Then register it in DOWNLOADER_MIDDLEWARES in the settings.py configuration file:

```python
DOWNLOADER_MIDDLEWARES = {
    # 'xie.middlewares.XieSpiderMiddleware': 543,
    'xie.middlewares.UserAgentMiddleware': 200,
    'xie.middlewares.RandomProxyMiddleware': 300,
    'xie.middlewares.WebDriverMiddleware': 400,
}
```
  
  

The Sina blog is an example of such a dynamic page: to capture all of its content, the page has to be scrolled down…
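A minimal sketch of that idea, assuming the blog keeps loading posts as you scroll (the URL and the number of scrolls are placeholders):

```python
import time

from selenium import webdriver

browser = webdriver.PhantomJS()
browser.get('http://blog.sina.com.cn/')  # placeholder URL

# Scroll to the bottom a few times so lazily loaded content gets rendered.
for _ in range(3):
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(2)  # give the page time to load the newly revealed content

data = browser.page_source
browser.quit()
```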

4. Disabling cookies

In special cases this prevents certain sites from blocking the crawler based on its cookies. In settings.py:

```python
COOKIES_ENABLED = False
```
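If cookies should only be dropped for some requests rather than globally, Scrapy's dont_merge_cookies meta key can be set per request; the spider below is a hypothetical illustration:

```python
import scrapy


class NoCookieSpider(scrapy.Spider):
    # Hypothetical spider, only to illustrate per-request cookie handling.
    name = 'no_cookie'

    def start_requests(self):
        # dont_merge_cookies stops the cookie middleware from sending or
        # storing cookies for this particular request.
        yield scrapy.Request('http://example.com/',
                             meta={'dont_merge_cookies': True},
                             callback=self.parse)

    def parse(self, response):
        pass
```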