Getting started
Installation
1 Install virtualenv
pip install virtualenv
The virtual environment can be created anywhere:
virtualenv <folder_name>
2 Install Scrapy
Inside the virtual environment:
pip install scrapy
pip install pypiwin32
Plugins
pip install Pillow (for saving images)
pip install MySQL_python-1.2.5-cp27-none-win32.whl
Exporting scraped items to a CSV file
scrapy crawl wy -o one.csv
wy is the spider name, one.csv is the output CSV file name
Basic usage
1 Create a Scrapy project
Inside the virtual environment, cd into the folder that should hold the project:
scrapy startproject <project_name>
2 Add a spider to the project (a rough skeleton of the result is sketched after this list)
cd into the project's spiders directory:
scrapy genspider <spider_name> <allowed_domain>
(write the allowed domain without http://)
3 Create a spider from the crawl template
scrapy genspider -t crawl <spider_name> <allowed_domain>
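What genspider generates is roughly the skeleton below; this is a sketch from memory rather than the exact template, and the names are examples only:

import scrapy

class WySpider(scrapy.Spider):
    name = 'wy'                         # used as: scrapy crawl wy
    allowed_domains = ['example.com']   # the domain passed to genspider
    start_urls = ['http://example.com/']

    def parse(self, response):
        # parsing logic goes here
        pass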
Common helpers
html.replace('\n','')
Replace: removes every newline
html.strip()
Strip: removes leading and trailing whitespace
f = lambda x: x[0] if x else ''
Takes the first element of a list, or returns '' if the list is empty
''.join(unicode_strings)
Joins a list of unicode strings into a single string (handles Chinese text)
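A minimal sketch of these helpers used together in a parse method; the XPath expressions and variable names here are made up for illustration:

def parse(self, response):
    first = lambda x: x[0] if x else ''               # first match or ''
    title = first(response.xpath('//h1/text()').extract())
    title = title.strip().replace('\n', '')           # drop surrounding spaces and newlines
    # join all text fragments into one unicode string (works for Chinese text)
    body = ''.join(response.xpath('//div[@class="post"]//text()').extract())
    print title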
Writing items to a JSON file
import json
class CnBlogJsonPipeline(object):
    def __init__(self):
        # one JSON object per line
        self.f = open('cnblog.json', 'w')

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False).encode('utf-8') + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()
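The pipeline only runs once it is registered in settings.py; the entry below reuses the project module name that appears in the image-download settings later in these notes:

ITEM_PIPELINES = {
    'py02_scrapy_day11.pipelines.CnBlogJsonPipeline': 1,
}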
Common functions (helper methods used inside spiders; they need import re at the top of the file)
def getValue(self, data):
    # first element of an extract() result, or '' when empty
    return data[0] if data else ''

def format(self, data):
    # strip surrounding whitespace and drop newlines
    return data.strip().replace('\n', '')

def getNumber(self, data):
    # pull the first integer out of a string, defaulting to 0
    num_re = re.compile(r'\d+')
    res = num_re.search(data)
    if res is not None:
        return int(res.group())
    return 0
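A quick sanity check of these helpers (the input strings are made up):

self.getValue([])                      # -> ''
self.getValue([u'hello'])              # -> u'hello'
self.format(u'  line one\nline two ')  # -> u'line oneline two'
self.getNumber(u'page 12 of 99')       # -> 12
self.getNumber(u'no digits here')      # -> 0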
Connecting to a database
Synchronous
import MySQLdb
class MySqlPipeline(object):
    def __init__(self):
        try:
            self.conn = MySQLdb.connect('127.0.0.1', 'root', '123456', 'temp', charset='utf8')
            self.cursor = self.conn.cursor()
        except Exception, e:
            print 'database connection failed'
            print str(e)

    def process_item(self, item, spider):
        print dict(item)
        # let MySQLdb quote and escape the parameters; do not wrap %s in quotes yourself
        sql = 'insert into tencent(title,location,type1,num,zhize,yaoqiu) ' \
              'values(%s,%s,%s,%s,%s,%s) on duplicate key update ' \
              'title=values(title)'
        try:
            self.cursor.execute(sql, (item['title'], item['location'], item['type1'],
                                      item['num'], item['zhize'], item['yaoqiu']))
            self.conn.commit()
        except Exception, e:
            print 'insert failed', str(e)
        return item
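Note that on duplicate key update only fires if the table has a unique index. A guess at a matching table definition, run once from the pipeline's __init__; only the column names come from the code above, the types and the unique key on title are assumptions:

self.cursor.execute("""
    create table if not exists tencent(
        id int auto_increment primary key,
        title varchar(200) unique,
        location varchar(100),
        type1 varchar(100),
        num varchar(20),
        zhize text,
        yaoqiu text
    ) default charset=utf8
""")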
Asynchronous (twisted adbapi)
Append to the end of settings.py:
MYHOST = '127.0.0.1'
MYUSER = 'root'
MYPASSWORD = '123456'
MYDB = 'temp'
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class YiysqlPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        db_config = dict(
            host=settings['MYHOST'],
            user=settings['MYUSER'],
            passwd=settings['MYPASSWORD'],
            db=settings['MYDB'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
        )
        # twisted's connection pool runs the blocking MySQLdb calls in a thread pool
        dbpool = adbapi.ConnectionPool('MySQLdb', **db_config)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.insert, item)
        query.addErrback(self.handle_error)
        return item

    def insert(self, cursor, item):
        sql = 'insert into tencent(title,location,type1,num,zhize,yaoqiu) ' \
              'values(%s,%s,%s,%s,%s,%s) on duplicate key update ' \
              'title=values(title)'
        cursor.execute(sql, (item['title'], item['location'], item['type1'],
                             item['num'], item['zhize'], item['yaoqiu']))

    # error handler for the Deferred returned by runInteraction
    def handle_error(self, error):
        print str(error)
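Like the synchronous pipeline, this one only takes effect when enabled in settings.py; the module path below is a placeholder for whatever your project is actually called:

ITEM_PIPELINES = {
    'yourproject.pipelines.YiysqlPipeline': 300,
}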
Downloading images
Configure settings.py (add import os at the top of settings.py for the paths below):
ITEM_PIPELINES = {
'py02_scrapy_day11.pipelines.CnBlogJsonPipeline': 1,
'py02_scrapy_day11.pipelines.CnblogImagePipeline': 2,
'py02_scrapy_day11.pipelines.CnblogMysqlPipeline': 3,
}
IMAGES_URLS_FIELD = 'image_url'
base_dir = os.path.dirname(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(base_dir,'images')
from scrapy.pipelines.images import ImagesPipeline

class CnblogImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, info_or_failure) tuples, one per image URL
        status = results[0][0]
        if status:
            item['image_path'] = results[0][1]['path']
        else:
            item['image_path'] = ''
        return item
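The fields referenced above (image_url via IMAGES_URLS_FIELD, image_path set in item_completed) imply an item roughly like this; only those two field names come from the code, the rest is a sketch:

import scrapy

class CnblogItem(scrapy.Item):
    title = scrapy.Field()
    image_url = scrapy.Field()    # must hold a list of image URLs for the images pipeline
    image_path = scrapy.Field()   # filled in by item_completed with the stored file path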
Three ways to simulate login
1 Clumsy but reliable
Crude and low-tech: log in with a browser, extract the cookies, and paste them into the spider.
This always gets the job done.
import scrapy
class RenrenSpider(scrapy.Spider):
name = 'renren1'
allowed_domains = ['renren.com']
headers = {
"Host": "www.renren.com",
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
"Upgrade-Insecure-Requests": "1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.8",
}
cookies = {
"anonymid": "j8wbzvzt91qfk6",
"_de": "31795CAA550243A1FFFC179CCE3D61136DEBB8C2103DE356",
"p": "1f30ef50aa4ced5b21c1c7989630c6f20",
"first_login_flag": "1",
"ln_uact": "1752570559@qq.com",
"ln_hurl": "http: // head.xiaonei.com / photos / 0 / 0 / men_main.gif",
"t": "5cbb6e214588f37be606a59589ede26e0",
"societyguester": "5cbb6e214588f37be606a59589ede26e0",
"id": "440906810",
"xnsid": "d300c830",
"loginfrom": "syshome",
}
def start_requests(self):
start_url = 'http://www.renren.com'
yield scrapy.Request(url=start_url, headers=self.headers, callback=self.parse,cookies=self.cookies)
def parse(self, response):
print response.body
2 Works for simple pages
POST the username and password directly.
Suitable for simple sites where the login URL and form parameters can be worked out.
import scrapy
class Renren2Spider(scrapy.Spider):
name = 'renren2'
allowed_domains = ['renren.com']
def start_requests(self):
login_url = 'http://www.renren.com/PLogin.do'
data = {
"email" : '1752570559@qq.com',
"password" : '1234qwer',
}
yield scrapy.FormRequest(url=login_url,formdata=data,callback=self.after_login)
def after_login(self, response):
print response.body
3 The most involved: captcha-protected login (e.g. Douban)
Fetch the login page, extract the captcha/auth fields, and POST them to the login URL together with the username and password.
Example: Douban
import scrapy
import urllib
class DoubanSpider(scrapy.Spider):
name = 'douban'
allowed_domains = ['douban.com']
header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
def start_requests(self):
login_url = 'https://accounts.douban.com/login'
yield scrapy.Request(login_url,callback=self.parse)
    def parse(self, response):
        captcha_src = response.xpath('//img[@id="captcha_image"]/@src').extract()
        if captcha_src:
            print 'captcha found, please enter it'
            captcha_url = captcha_src[0]
            print captcha_url
            # save the captcha image locally so the user can read it
            urllib.urlretrieve(captcha_url, 'douban.jpg')
            captcha_solution = raw_input('enter the captcha (see douban.jpg): ')
            data = {
                'form_email': '1752570559@qq.com',
                'form_password': '1234qwer',
                'captcha-solution': captcha_solution,
            }
            # from_response keeps the hidden form fields from the login page
            yield scrapy.FormRequest.from_response(
                response,
                headers=self.header,
                formdata=data,
                callback=self.after_login,
            )
        else:
            print 'no captcha, logging in directly'
    def after_login(self, response):
        title = response.xpath('//title/text()').extract()[0]
        print title
        # Douban's page title still contains "登录" (login) when the login failed
        if u'登录' in title:
            print 'login failed'
        else:
            print 'login succeeded'
            print response.body
            '''
            post-login business logic goes here
            '''
Downloader middlewares
Random User-Agent
Create a mymiddleware file at the same level as items.py.
In settings, add the list below (and remember to enable the middleware!!):
USER_AGENS = [
    'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; IEMobile/7.0; LG; GW910)',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
]
In the mymiddleware file:
from settings import USER_AGENS,PROXIES
import random
import base64
class RandomUserAgent(object):
def process_request(self,request,spider):
user_agent = random.choice(USER_AGENS)
request.headers.setdefault('User-Agent',user_agent)
Random proxy rotation
In settings (and remember to enable the middleware!!):
PROXIES = [
{'host' : '111.200.58.94:80'},
{'host' : '116.62.128.50:16816', 'auth' : '1752570559:wd0p04kd'}
]
In the mymiddleware file:
class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        if proxy.get('auth') is None:
            # open proxy, no credentials needed
            request.meta['proxy'] = 'http://' + proxy['host']
        else:
            # private proxy: send base64-encoded "user:password" in the header
            auth = base64.b64encode(proxy['auth'])
            request.headers['Proxy-Authorization'] = 'Basic ' + auth
            request.meta['proxy'] = 'http://' + proxy['host']
Enable both middlewares in settings:
DOWNLOADER_MIDDLEWARES = {
'py02_spider_day13.mymiddlewares.RandomUserAgent': 1,
'py02_spider_day13.mymiddlewares.RandomProxy': 2,
}
Running multiple spiders at once
http://blog.csdn.net/mingzznet/article/details/51315572
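Besides the article linked above, a common approach is Scrapy's CrawlerProcess; a minimal sketch, with placeholder spider names:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('spider_one')   # spiders are referenced by their name attribute
process.crawl('spider_two')
process.start()               # blocks until every scheduled spider has finished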
selenium
import scrapy
from selenium import webdriver
class ZhihuSpider(scrapy.Spider):
name = "zhihu"
allowed_domains = ["zhihu.com"]
start_urls = (
'https://www.zhihu.com/',
)
    def get_cookies(self):
        driver = webdriver.Chrome()
        driver.get(self.start_urls[0])
        # open the login form ("登录" = login) and fill in the credentials
        driver.find_element_by_link_text(u"登录").click()
        driver.find_element_by_name("account").clear()
        driver.find_element_by_name("account").send_keys("your username")
        driver.find_element_by_name("password").clear()
        driver.find_element_by_name("password").send_keys("keys")
        SignInURL = u"https://www.zhihu.com/#signin"
        try:
            # if a captcha is shown, wait until the user logs in manually
            # (the current URL changes once login succeeds)
            if driver.find_element_by_id('captcha'):
                while True:
                    if not SignInURL == driver.current_url:
                        break
        finally:
            # no captcha (find_element raised): submit the form ourselves
            if SignInURL == driver.current_url:
                driver.find_element_by_css_selector("button.sign-button.submit").click()
        cookies = driver.get_cookies()
        driver.close()
        print cookies
        return cookies
    def after_login(self, response):
        sel = scrapy.Selector(response)
        for i in range(1, 10):
            xpath = r'//*[@id="feed-%d"]/div[1]/div[2]/div[2]/h2/a/text()' % (i)
            title = sel.xpath(xpath).extract()
            if len(title):
                # encode explicitly so Chinese titles print without UnicodeEncodeError
                print title[0].encode('utf-8')
def parse(self, response):
return scrapy.Request(url=self.start_urls[0], cookies=self.get_cookies(), callback=self.after_login)
Converting a spider to a distributed one (scrapy-redis)
In the spider
replace start_urls = ['http://www.mzitu.com/'] with
redis_key = 'meizitu:start_urls'
The imports and base class become:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
import hashlib
import re,os
import requests
class CeshiSpider(RedisCrawlSpider):
In settings:
ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 999,
}
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
Create a script next to the main file that drains items from Redis into MySQL:
import json
import redis
import MySQLdb
def main():
    try:
        rediscli = redis.StrictRedis(host='39.106.37.83', port=6379, db=0)
        mysqlcli = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                                   db='temp', port=3306, charset='utf8')
        print 'connected'
    except Exception, e:
        print 'database connection failed'
        print str(e)
        exit()
    while True:
        # blpop blocks until the spider pushes a serialized item onto lp:items
        source, data = rediscli.blpop(["lp:items"])
        item = json.loads(data)
        try:
            cur = mysqlcli.cursor()
            # let MySQLdb escape the values instead of formatting them into the string
            sql = "insert ignore into liepin(url,company,position,salary,location,work_years,degree," \
                  "position_type,tags,pub_date,position_desc,work_address) " \
                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            cur.execute(sql, (item['url'], item['company'], item['position'], item['salary'],
                              item['location'], item['work_years'], item['degree'], item['position_type'],
                              item['tags'], item['pub_date'], item['position_desc'], item['work_address']))
            mysqlcli.commit()
            cur.close()
            print "inserted %s" % item['company']
        except Exception, e:
            print 'insert failed'
            print str(e)

if __name__ == '__main__':
    main()
redis
redis-cli
lpush lp:start_urls 'http:
Errors
Unhandled error in Deferred
Usually means the environment dependencies are wrong; compare against a colleague's working PyCharm environment and reinstall every package to match it.