items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class FirproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title
    name = scrapy.Field()
    # feedback rate
    percent = scrapy.Field()
    # publishing company
    company = scrapy.Field()
    # monthly salary
    salary = scrapy.Field()
    # work location
    position = scrapy.Field()
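A Scrapy Item behaves like a dictionary, so the fields declared above are written and read with subscript syntax. A minimal standalone sketch (the sample values are made up for illustration):

item = FirproItem()
item['name'] = 'python开发工程师'
item['salary'] = '10000-15000'
# items convert cleanly to plain dicts, which the JSON pipeline below relies on
print(dict(item))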
#### 2. Create fir_spider.py in the spiders directory
# -*- coding: utf-8 -*-
import scrapy


# Custom spider class; it must inherit from scrapy.Spider
class Firspider(scrapy.Spider):
    # name of the spider, used to launch it
    name = 'firspider'
    # domains the spider is allowed to crawl
    allowed_domains = ['zhaopin.com']
    # list/tuple of URLs the spider actually starts crawling from
    start_urls = ('http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E4%B8%8A%E6%B5%B7&kw=python&sm=0&p=1&source=0',)

    # callback that handles the downloaded response
    def parse(self, response):
        # response.body is bytes, so the file must be opened in binary mode
        with open('智联.html', 'wb') as f:
            f.write(response.body)
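With name = 'firspider' defined, the spider is started from the project root with `scrapy crawl firspider`. It can also be launched from a plain script; a minimal sketch using Scrapy's CrawlerProcess (assuming fir_spider.py is importable from where the script runs):

from scrapy.crawler import CrawlerProcess
from fir_spider import Firspider

process = CrawlerProcess()
process.crawl(Firspider)
process.start()  # blocks until the crawl finishes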
items.py (unchanged from above)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class FirproPipeline(object):
    def __init__(self):
        # open the output file once when the pipeline is created
        self.file = open('zhilian.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # serialize the item to JSON, keeping Chinese text readable
        text = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(text + '\n')
        print('-----------------')
        return item

    def close_spider(self, spider):
        self.file.close()
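As the boilerplate comment says, the pipeline only runs if it is registered in settings.py. A minimal sketch, assuming the project package is named firPro (the spider below imports from firPro.items):

ITEM_PIPELINES = {
    'firPro.pipelines.FirproPipeline': 300,  # value is 0-1000; lower numbers run first
}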
fir_spider.py
# -*- coding: utf-8 -*-
import re

import scrapy
from firPro.items import FirproItem


# Custom spider class; it must inherit from scrapy.Spider
class Firspider(scrapy.Spider):
    # compiled regex used to strip whitespace out of matched text
    reg = re.compile(r'\s+')
    # name of the spider, used to launch it
    name = 'firspider'
    # domains the spider is allowed to crawl
    allowed_domains = ['zhaopin.com']
    # base URL of the search results; the page number is appended below
    url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E4%B8%8A%E6%B5%B7&kw=python&sm=0&source=0&sg=b8e8fb4080fa47afa69cd683dfbfccf9&p='
    p = 1
    start_urls = [url + str(p)]

    def parse(self, response):
        # the tables that hold the matched job postings
        job_list = response.xpath('//div[@id="newlist_list_div"]//table')[2:]
        for job in job_list:
            # create an Item object to hold the target data
            item = FirproItem()
            name = job.xpath(".//tr[1]//td[1]//a")
            # string(.) flattens the link to one string; the regex strips whitespace
            item["name"] = self.reg.sub('', name.xpath("string(.)").extract()[0])
            item["percent"] = job.xpath(".//td[2]//span[1]/text()").extract()
            item["company"] = job.xpath(".//td[3]//a/text()").extract()
            item["salary"] = job.xpath(".//td[4]/text()").extract()
            item["position"] = job.xpath(".//td[5]/text()").extract()
            # hand the item over to the pipeline
            yield item
        # follow the next results page, up to page 10
        if self.p <= 10:
            self.p += 1
            yield scrapy.Request(self.url + str(self.p), callback=self.parse)
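The reg pattern exists because the job-title link mixes several text nodes with heavy whitespace: string(.) flattens the node to one string, and the regex removes all whitespace from it. A quick standalone check of that step:

import re

reg = re.compile(r'\s+')
print(reg.sub('', '  python\n 开发工程师  '))  # -> 'python开发工程师'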
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ZhycItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # fields to capture for each job posting
    name = scrapy.Field()
    publish = scrapy.Field()
    company = scrapy.Field()
    require = scrapy.Field()
    salary = scrapy.Field()
    desc = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class ZhycPipeline(object):
    def __init__(self):
        self.file = open("zhonghuayingcai.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(text + "\n")
        print("*****************************************")
        return item

    def close_spider(self, spider):
        self.file.close()
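Because process_item writes one JSON object per line, the output file is JSON Lines rather than a single JSON document. A minimal sketch of reading it back:

import json

with open('zhonghuayingcai.json', encoding='utf-8') as f:
    for line in f:
        print(json.loads(line))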
items.py
# -*- coding: utf-8 -*-
import scrapy


class ZlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title
    name = scrapy.Field()
    # feedback rate
    percent = scrapy.Field()
    # company name
    company = scrapy.Field()
    # monthly salary
    salary = scrapy.Field()
    # work location
    position = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class ZlPipeline(object):
    def __init__(self):
        self.file = open("sdzp.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        text = json.dumps(dict(item), ensure_ascii=False)
        self.file.write(text + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()
zlzp.py
# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from zl.items import ZlItem


class ZlzpSpider(CrawlSpider):
    name = 'sdzpspider'
    allowed_domains = ['zhaopin.com']
    start_urls = ['http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%e4%b8%8a%e6%b5%b7&kw=python&sm=0&source=0&sg=936e2219abfb4f07a17009a930d54a37&p=1']

    # rule for extracting the pagination links
    page_link = LinkExtractor(allow=(r'&sg=936e2219abfb4f07a17009a930d54a37&p=\d+',))
    # crawling rules: follow every matched link and parse it with parse_content
    rules = [
        Rule(page_link, callback='parse_content', follow=True),
    ]

    # callback that extracts the job data from each results page
    def parse_content(self, response):
        # select the region of the page that holds the listings
        job_list = response.xpath('//div[@id="newlist_list_content_table"]//table//tr[1]')
        for job in job_list:
            # an item to hold the target data
            item = ZlItem()
            name = job.xpath(".//td[1]//a")
            if len(name) > 0:
                item['name'] = name.xpath('string(.)').extract()[0]
            percent = job.xpath('.//td[2]//span/text()')
            if len(percent) > 0:
                item['percent'] = percent.extract()[0]
            company = job.xpath(".//td[3]//a[1]/text()")
            if len(company) > 0:
                item["company"] = company.extract()[0]
            salary = job.xpath(".//td[4]/text()")
            if len(salary) > 0:
                item["salary"] = salary.extract()[0]
            position = job.xpath(".//td[5]/text()")
            if len(position) > 0:
                item["position"] = position.extract()[0]
            yield item
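The allow pattern is an ordinary regular expression tested against every candidate URL on the page, which is how the single rule above follows all the pagination links. A standalone sketch of that matching (the HTML snippet is made up for illustration):

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = b'<a href="http://sou.zhaopin.com/jobs/searchresult.ashx?sg=936e2219abfb4f07a17009a930d54a37&p=2">next</a>'
response = HtmlResponse(url='http://sou.zhaopin.com/', body=html)
extractor = LinkExtractor(allow=(r'&p=\d+',))
for link in extractor.extract_links(response):
    print(link.url)  # only URLs matching the pattern are returned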