Getting started
Installation
1 Install virtualenv
pip install virtualenv
The virtual environment can be created anywhere:
virtualenv <folder_name>
2 Install Scrapy
Inside the virtual environment:
pip install scrapy
pip install pypiwin32
Plugins
pip install Pillow (for saving images)
pip install MySQL_python-1.2.5-cp27-none-win32.whl
Exporting scraped items to a CSV file
scrapy crawl wy -o one.csv
wy is the spider name, one.csv is the output CSV file name
Basic usage
1 Create a Scrapy project
Inside the virtual environment, cd into the folder that should hold the project:
scrapy startproject <project_name>
2 Add a spider to the project (a rough skeleton of the result is sketched after this list)
cd into the project's spiders directory:
scrapy genspider <spider_name> <allowed_domain>
(write the allowed domain without http://)
3 Create a spider from the crawl template
scrapy genspider -t crawl <spider_name> <allowed_domain>
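What genspider generates is roughly the skeleton below; this is a sketch from memory rather than the exact template, and the names are examples only:

import scrapy

class WySpider(scrapy.Spider):
    name = 'wy'                         # used as: scrapy crawl wy
    allowed_domains = ['example.com']   # the domain passed to genspider
    start_urls = ['http://example.com/']

    def parse(self, response):
        # parsing logic goes here
        pass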
Common helpers
html.replace('\n','')
Replace: removes every newline
html.strip()
Strip: removes leading and trailing whitespace
f = lambda x: x[0] if x else ''
Takes the first element of a list, or returns '' if the list is empty
''.join(unicode_strings)
Joins a list of unicode strings into a single string (handles Chinese text)
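A minimal sketch of these helpers used together in a parse method; the XPath expressions and variable names here are made up for illustration:

def parse(self, response):
    first = lambda x: x[0] if x else ''               # first match or ''
    title = first(response.xpath('//h1/text()').extract())
    title = title.strip().replace('\n', '')           # drop surrounding spaces and newlines
    # join all text fragments into one unicode string (works for Chinese text)
    body = ''.join(response.xpath('//div[@class="post"]//text()').extract())
    print title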
Writing items to a JSON file
import json
class CnBlogJsonPipeline(object):
    def __init__(self):
        # one JSON object per line
        self.f = open('cnblog.json', 'w')

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False).encode('utf-8') + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()
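The pipeline only runs once it is registered in settings.py; the entry below reuses the project module name that appears in the image-download settings later in these notes:

ITEM_PIPELINES = {
    'py02_scrapy_day11.pipelines.CnBlogJsonPipeline': 1,
}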
Common functions (helper methods used inside spiders; they need import re at the top of the file)
def getValue(self, data):
    # first element of an extract() result, or '' when empty
    return data[0] if data else ''

def format(self, data):
    # strip surrounding whitespace and drop newlines
    return data.strip().replace('\n', '')

def getNumber(self, data):
    # pull the first integer out of a string, defaulting to 0
    num_re = re.compile(r'\d+')
    res = num_re.search(data)
    if res is not None:
        return int(res.group())
    return 0
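A quick sanity check of these helpers (the input strings are made up):

self.getValue([])                      # -> ''
self.getValue([u'hello'])              # -> u'hello'
self.format(u'  line one\nline two ')  # -> u'line oneline two'
self.getNumber(u'page 12 of 99')       # -> 12
self.getNumber(u'no digits here')      # -> 0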
Connecting to a database
Synchronous
import MySQLdb
class MySqlPipeline(object):
    def __init__(self):
        try:
            self.conn = MySQLdb.connect('127.0.0.1', 'root', '123456', 'temp', charset='utf8')
            self.cursor = self.conn.cursor()
        except Exception, e:
            print 'database connection failed'
            print str(e)

    def process_item(self, item, spider):
        print dict(item)
        # let MySQLdb quote and escape the parameters; do not wrap %s in quotes yourself
        sql = 'insert into tencent(title,location,type1,num,zhize,yaoqiu) ' \
              'values(%s,%s,%s,%s,%s,%s) on duplicate key update ' \
              'title=values(title)'
        try:
            self.cursor.execute(sql, (item['title'], item['location'], item['type1'],
                                      item['num'], item['zhize'], item['yaoqiu']))
            self.conn.commit()
        except Exception, e:
            print 'insert failed', str(e)
        return item
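Note that on duplicate key update only fires if the table has a unique index. A guess at a matching table definition, run once from the pipeline's __init__; only the column names come from the code above, the types and the unique key on title are assumptions:

self.cursor.execute("""
    create table if not exists tencent(
        id int auto_increment primary key,
        title varchar(200) unique,
        location varchar(100),
        type1 varchar(100),
        num varchar(20),
        zhize text,
        yaoqiu text
    ) default charset=utf8
""")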
Asynchronous (twisted adbapi)
Append to the end of settings.py:
MYHOST = '127.0.0.1'
MYUSER = 'root'
MYPASSWORD = '123456'
MYDB = 'temp'
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class YiysqlPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        db_config = dict(
            host=settings['MYHOST'],
            user=settings['MYUSER'],
            passwd=settings['MYPASSWORD'],
            db=settings['MYDB'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
        )
        # twisted's connection pool runs the blocking MySQLdb calls in a thread pool
        dbpool = adbapi.ConnectionPool('MySQLdb', **db_config)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.insert, item)
        query.addErrback(self.handle_error)
        return item

    def insert(self, cursor, item):
        sql = 'insert into tencent(title,location,type1,num,zhize,yaoqiu) ' \
              'values(%s,%s,%s,%s,%s,%s) on duplicate key update ' \
              'title=values(title)'
        cursor.execute(sql, (item['title'], item['location'], item['type1'],
                             item['num'], item['zhize'], item['yaoqiu']))

    # error handler for the Deferred returned by runInteraction
    def handle_error(self, error):
        print str(error)
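Like the synchronous pipeline, this one only takes effect when enabled in settings.py; the module path below is a placeholder for whatever your project is actually called:

ITEM_PIPELINES = {
    'yourproject.pipelines.YiysqlPipeline': 300,
}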
Downloading images
Configure settings.py (add import os at the top of settings.py for the paths below):
ITEM_PIPELINES = {
'py02_scrapy_day11.pipelines.CnBlogJsonPipeline': 1,
'py02_scrapy_day11.pipelines.CnblogImagePipeline': 2,
'py02_scrapy_day11.pipelines.CnblogMysqlPipeline': 3,
}
IMAGES_URLS_FIELD = 'image_url'
base_dir = os.path.dirname(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(base_dir,'images')
from scrapy.pipelines.images import ImagesPipeline

class CnblogImagePipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        # results is a list of (success, info_or_failure) tuples, one per image URL
        status = results[0][0]
        if status:
            item['image_path'] = results[0][1]['path']
        else:
            item['image_path'] = ''
        return item
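The fields referenced above (image_url via IMAGES_URLS_FIELD, image_path set in item_completed) imply an item roughly like this; only those two field names come from the code, the rest is a sketch:

import scrapy

class CnblogItem(scrapy.Item):
    title = scrapy.Field()
    image_url = scrapy.Field()    # must hold a list of image URLs for the images pipeline
    image_path = scrapy.Field()   # filled in by item_completed with the stored file path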
Three ways to simulate login
1 Clumsy but reliable
Crude and low-tech: log in with a browser, extract the cookies, and paste them into the spider.
This always gets the job done.
import scrapy
class RenrenSpider(scrapy.Spider):
name = 'renren1'
allowed_domains = ['renren.com']
headers = {
"Host": "www.renren.com",
"Connection": "keep-alive",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36",
"Upgrade-Insecure-Requests": "1",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.8",
}
cookies = {
"anonymid": "j8wbzvzt91qfk6",
"_de": "31795CAA550243A1FFFC179CCE3D61136DEBB8C2103DE356",
"p": "1f30ef50aa4ced5b21c1c7989630c6f20",
"first_login_flag": "1",
"ln_uact": "1752570559@qq.com",
"ln_hurl": "http: // head.xiaonei.com / photos / 0 / 0 / men_main.gif",
"t": "5cbb6e214588f37be606a59589ede26e0",
"societyguester": "5cbb6e214588f37be606a59589ede26e0",
"id": "440906810",
"xnsid": "d300c830",
"loginfrom": "syshome",
}
def start_requests(self):
start_url = 'http://www.renren.com'
yield scrapy.Request(url=start_url, headers=self.headers, callback=self.parse,cookies=self.cookies)
def parse(self, response):
print response.body
2 Works for simple pages
POST the username and password directly.
Suitable for simple sites where the login URL and form parameters can be worked out.
import scrapy
class Renren2Spider(scrapy.Spider):
name = 'renren2'
allowed_domains = ['renren.com']
def start_requests(self):
login_url = 'http://www.renren.com/PLogin.do'
data = {
"email" : '1752570559@qq.com',
"password" : '1234qwer',
}
yield scrapy.FormRequest(url=login_url,formdata=data,callback=self.after_login)
def after_login(self, response):
print response.body
3 The most involved: captcha-protected login (e.g. Douban)
Fetch the login page, extract the captcha/auth fields, and POST them to the login URL together with the username and password.
Example: Douban
import scrapy
import urllib
class DoubanSpider(scrapy.Spider):
name = 'douban'
allowed_domains = ['douban.com']
header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
def start_requests(self):
login_url = 'https://accounts.douban.com/login'
yield scrapy.Request(login_url,callback=self.parse)
    def parse(self, response):
        captcha_src = response.xpath('//img[@id="captcha_image"]/@src').extract()
        if captcha_src:
            print 'captcha found, please enter it'
            captcha_url = captcha_src[0]
            print captcha_url
            # save the captcha image locally so the user can read it
            urllib.urlretrieve(captcha_url, 'douban.jpg')
            captcha_solution = raw_input('enter the captcha (see douban.jpg): ')
            data = {
                'form_email': '1752570559@qq.com',
                'form_password': '1234qwer',
                'captcha-solution': captcha_solution,
            }
            # from_response keeps the hidden form fields from the login page
            yield scrapy.FormRequest.from_response(
                response,
                headers=self.header,
                formdata=data,
                callback=self.after_login,
            )
        else:
            print 'no captcha, logging in directly'
    def after_login(self, response):
        title = response.xpath('//title/text()').extract()[0]
        print title
        # Douban's page title still contains "登录" (login) when the login failed
        if u'登录' in title:
            print 'login failed'
        else:
            print 'login succeeded'
            print response.body
            '''
            post-login business logic goes here
            '''
Downloader middlewares
Random User-Agent
Create a mymiddleware file at the same level as items.py.
In settings, add the list below (and remember to enable the middleware!!):
USER_AGENS = [
    'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows Phone OS 7.0; Trident/3.1; IEMobile/7.0; LG; GW910)',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
]
In the mymiddleware file:
from settings import USER_AGENS,PROXIES
import random
import base64
class RandomUserAgent(object):
def process_request(self,request,spider):
user_agent = random.choice(USER_AGENS)
request.headers.setdefault('User-Agent',user_agent)
Random proxy rotation
In settings (and remember to enable the middleware!!):
PROXIES = [
{'host' : '111.200.58.94:80'},
{'host' : '116.62.128.50:16816', 'auth' : '1752570559:wd0p04kd'}
]
In the mymiddleware file:
class RandomProxy(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        if proxy.get('auth') is None:
            # open proxy, no credentials needed
            request.meta['proxy'] = 'http://' + proxy['host']
        else:
            # private proxy: send base64-encoded "user:password" in the header
            auth = base64.b64encode(proxy['auth'])
            request.headers['Proxy-Authorization'] = 'Basic ' + auth
            request.meta['proxy'] = 'http://' + proxy['host']
Enable both middlewares in settings:
DOWNLOADER_MIDDLEWARES = {
'py02_spider_day13.mymiddlewares.RandomUserAgent': 1,
'py02_spider_day13.mymiddlewares.RandomProxy': 2,
}
Running multiple spiders at once
http://blog.csdn.net/mingzznet/article/details/51315572
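Besides the article linked above, a common approach is Scrapy's CrawlerProcess; a minimal sketch, with placeholder spider names:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('spider_one')   # spiders are referenced by their name attribute
process.crawl('spider_two')
process.start()               # blocks until every scheduled spider has finished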
selenium
import scrapy
from selenium import webdriver
class ZhihuSpider(scrapy.Spider):
name = "zhihu"
allowed_domains = ["zhihu.com"]
start_urls = (
'https://www.zhihu.com/',
)
    def get_cookies(self):
        driver = webdriver.Chrome()
        driver.get(self.start_urls[0])
        # open the login form ("登录" = login) and fill in the credentials
        driver.find_element_by_link_text(u"登录").click()
        driver.find_element_by_name("account").clear()
        driver.find_element_by_name("account").send_keys("your username")
        driver.find_element_by_name("password").clear()
        driver.find_element_by_name("password").send_keys("keys")
        SignInURL = u"https://www.zhihu.com/#signin"
        try:
            # if a captcha is shown, wait until the user logs in manually
            # (the current URL changes once login succeeds)
            if driver.find_element_by_id('captcha'):
                while True:
                    if not SignInURL == driver.current_url:
                        break
        finally:
            # no captcha (find_element raised): submit the form ourselves
            if SignInURL == driver.current_url:
                driver.find_element_by_css_selector("button.sign-button.submit").click()
        cookies = driver.get_cookies()
        driver.close()
        print cookies
        return cookies
    def after_login(self, response):
        sel = scrapy.Selector(response)
        for i in range(1, 10):
            xpath = r'//*[@id="feed-%d"]/div[1]/div[2]/div[2]/h2/a/text()' % (i)
            title = sel.xpath(xpath).extract()
            if len(title):
                # encode explicitly so Chinese titles print without UnicodeEncodeError
                print title[0].encode('utf-8')
def parse(self, response):
return scrapy.Request(url=self.start_urls[0], cookies=self.get_cookies(), callback=self.after_login)
Converting a spider to a distributed one (scrapy-redis)
In the spider
replace start_urls = ['http://www.mzitu.com/'] with
redis_key = 'meizitu:start_urls'
The imports and base class become:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
import hashlib
import re,os
import requests
class CeshiSpider(RedisCrawlSpider):
In settings:
ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 999,
}
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
Create a script next to the main file that drains items from Redis into MySQL:
import json
import redis
import MySQLdb
def main():
    try:
        rediscli = redis.StrictRedis(host='39.106.37.83', port=6379, db=0)
        mysqlcli = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                                   db='temp', port=3306, charset='utf8')
        print 'connected'
    except Exception, e:
        print 'database connection failed'
        print str(e)
        exit()
    while True:
        # blpop blocks until the spider pushes a serialized item onto lp:items
        source, data = rediscli.blpop(["lp:items"])
        item = json.loads(data)
        try:
            cur = mysqlcli.cursor()
            # let MySQLdb escape the values instead of formatting them into the string
            sql = "insert ignore into liepin(url,company,position,salary,location,work_years,degree," \
                  "position_type,tags,pub_date,position_desc,work_address) " \
                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            cur.execute(sql, (item['url'], item['company'], item['position'], item['salary'],
                              item['location'], item['work_years'], item['degree'], item['position_type'],
                              item['tags'], item['pub_date'], item['position_desc'], item['work_address']))
            mysqlcli.commit()
            cur.close()
            print "inserted %s" % item['company']
        except Exception, e:
            print 'insert failed'
            print str(e)

if __name__ == '__main__':
    main()
redis
redis-cli
lpush lp:start_urls 'http:
Errors
Unhandled error in Deferred
Usually means the environment dependencies are wrong; compare against a colleague's working PyCharm environment and reinstall every package to match it.