此数据爬取仅作学习研究用,严禁用做商业用途
目标设定
爬取IT桔子死亡公司、投资机构、LP、GP、基金机构数据,并录入MongoDB,全量爬取并判重。
前期准备
分析请求路径
通过Chrome进行抓包分析和测试,获得以下获取数据的路径:
死亡公司
请求路径:www.itjuzi.com/api/closure 请求方法:GET 请求参数:page 页码
投资机构
请求路径:www.itjuzi.com/api/investments 请求方法:POST 请求参数:page 页码
LP
请求路径:www.itjuzi.com/api/lp 请求方法:POST 请求参数:page 页码
GP
请求路径:www.itjuzi.com/api/gp 请求方法:POST 请求参数:page 页码
基金机构
请求路径:www.itjuzi.com/api/fund 请求方法:POST 请求参数:page 页码
分析响应报文
此处使用EOLINKER进行API测试,以死亡公司为例:
info内的数据都是我们想要的数据
数据存储规划
爬取的数据都存入MongoDB的itorange数据库内,数据集名分别为death_company、gp、lp、fund、investments,即爬虫任务的名称,数据记录为各公司机构详细信息。
代码实现
死亡公司、投资机构、LP、GP、基金机构五类数据,分别配置一个独立的爬虫任务,并传入独立的处理管道。
由于爬虫任务中请求方法和参数处理不一样,故一个爬虫任务一个文件;处理管道除具体处理不一样,都需要连接数据库和打印始末日志,故可创建基类,各处理管道重写基类方法即可。
此外,为避免频繁访问被封,每次请求前暂停随机秒数。
主要代码文件实现如下:
setting.py 配置文件
# MongoDB connection settings.
# NOTE(review): real host/credentials are committed in plain text — move
# them to environment variables or a secrets store before publishing.
MONGO_HOST = '106.13.73.198'
MONGO_PORT = 31000
MONGO_USER = 'root'
MONGO_PASSWORD = '@wjbd'
MONGO_DB = 'itorange'
# Ignore robots.txt so Scrapy does not filter out the API endpoints.
ROBOTSTXT_OBEY = False
items.py 数据模型文件
# -*- coding: utf-8 -*-
import scrapy
# Dead-company record: wraps the raw `data.info` list returned by /api/closure.
class DeathCompanyItem(scrapy.Item):
    company_info = scrapy.Field()
# Investment-institution record: wraps the raw list from /api/investments.
class InvestmentItem(scrapy.Item):
    investment_info = scrapy.Field()
# LP (limited partner) record from /api/lp.
class LpItem(scrapy.Item):
    lp_info = scrapy.Field()
# GP (general partner) record from /api/gp.
class GpItem(scrapy.Item):
    gp_info = scrapy.Field()
# Fund-institution record from /api/fund.
class FundItem(scrapy.Item):
    fund_info = scrapy.Field()
spider.py 爬虫任务文件
# -*- coding: utf-8 -*-
"""
死亡公司库爬取
"""
from ..items import DeathCompanyItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)
# 死亡公司库爬虫
# Spider for the dead-company (closure) library.
class DeathCompanySpider(scrapy.Spider):
    """Crawl https://www.itjuzi.com/api/closure page by page.

    Each successful response yields one DeathCompanyItem whose
    `company_info` field holds the raw `data.info` list from the API.
    """
    name = 'death_company'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint.
    start_url = 'https://www.itjuzi.com/api/closure'
    # Number of pages to request.
    MAX_PAGE = 630
    custom_settings = {
        # Route items to the dedicated pipeline.
        'ITEM_PIPELINES': {'itorange.pipelines.DeathCompanyPipeline': 300},
        # Throttle politely: Scrapy sleeps a randomized DOWNLOAD_DELAY before
        # every request, replacing the original one-off time.sleep() that ran
        # only once for the whole crawl.
        'DOWNLOAD_DELAY': 3,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Emit one GET request per page.

        Bug fix: the page number used to be passed via `meta`, which is
        Scrapy-internal metadata and never reaches the server — every
        request effectively fetched the same page. It now travels in the
        query string.
        """
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                url='%s?page=%d' % (self.start_url, page),
                meta={'page': page},  # kept so callbacks/debugging can see it
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON body and yield the company list, if present."""
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误!')
            return
        # The API signals success with code == 200 inside the JSON payload;
        # .get() avoids a KeyError on malformed payloads.
        if result.get('code') != 200:
            return
        company_info = (result.get('data') or {}).get('info')
        if not company_info:
            logging.error('未发现死亡公司信息!')
            return
        death_company_item = DeathCompanyItem()
        death_company_item['company_info'] = company_info
        yield death_company_item
# -*- coding: utf-8 -*-
"""
投资机构数据爬取
"""
from ..items import InvestmentItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)
# 投资机构爬虫
# Spider for investment institutions.
class InvestmentsSpider(scrapy.Spider):
    """Crawl https://www.itjuzi.com/api/investments page by page via POST.

    Each successful response yields one InvestmentItem whose
    `investment_info` field holds the raw `data.data` list from the API.
    """
    name = 'investments'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint.
    start_url = 'https://www.itjuzi.com/api/investments'
    # Number of pages to request.
    MAX_PAGE = 1
    custom_settings = {
        # Route items to the dedicated pipeline.
        'ITEM_PIPELINES': {'itorange.pipelines.InvestmentPipeline': 300},
        # Randomized per-request delay instead of a single time.sleep().
        'DOWNLOAD_DELAY': 3,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Emit one POST request per page.

        Bug fix: the page number used to be passed via `meta`, which never
        reaches the server — the POST body was empty. It is now sent as a
        JSON body. TODO(review): confirm the API expects JSON rather than
        form-encoded parameters.
        """
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                url=self.start_url,
                method='POST',
                body=json.dumps({'page': page}),
                headers={'Content-Type': 'application/json'},
                meta={'page': page},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON body and yield the institution list, if present."""
        # (debug print of response.headers removed)
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误!')
            return
        # Success is signalled by code == 200 inside the JSON payload.
        if result.get('code') != 200:
            return
        investment_info = (result.get('data') or {}).get('data')
        if not investment_info:
            logging.error('未发现投资机构信息!')
            return
        investment_item = InvestmentItem()
        investment_item['investment_info'] = investment_info
        yield investment_item
# -*- coding: utf-8 -*-
"""
GP数据爬取
"""
from ..items import GpItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)
# Spider for GP (general partner) data.
class GpSpider(scrapy.Spider):
    """Crawl https://www.itjuzi.com/api/gp page by page via POST.

    Each successful response yields one GpItem whose `gp_info` field
    holds the raw `data.list` from the API.
    """
    name = 'gp'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint.
    start_url = 'https://www.itjuzi.com/api/gp'
    # Number of pages to request.
    MAX_PAGE = 805
    custom_settings = {
        # Route items to the dedicated pipeline.
        'ITEM_PIPELINES': {'itorange.pipelines.GpPipeline': 300},
        # Randomized per-request delay instead of a single time.sleep().
        'DOWNLOAD_DELAY': 3,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Emit one POST request per page.

        Bug fix: the page number used to be passed via `meta`, which never
        reaches the server — the POST body was empty. It is now sent as a
        JSON body. TODO(review): confirm the API expects JSON rather than
        form-encoded parameters.
        """
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                url=self.start_url,
                method='POST',
                body=json.dumps({'page': page}),
                headers={'Content-Type': 'application/json'},
                meta={'page': page},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON body and yield the GP list, if present."""
        # (debug print of response.headers removed)
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误!')
            return
        # Success is signalled by code == 200 inside the JSON payload.
        if result.get('code') != 200:
            return
        gp_info = (result.get('data') or {}).get('list')
        if not gp_info:
            logging.error('未发现GP信息!')
            return
        gp_item = GpItem()
        gp_item['gp_info'] = gp_info
        yield gp_item
# -*- coding: utf-8 -*-
"""
LP数据爬取
"""
from ..items import LpItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)
# Spider for LP (limited partner) data.
class LpSpider(scrapy.Spider):
    """Crawl https://www.itjuzi.com/api/lp page by page via POST.

    Each successful response yields one LpItem whose `lp_info` field
    holds the raw `data.data` list from the API.
    """
    name = 'lp'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint.
    start_url = 'https://www.itjuzi.com/api/lp'
    # Number of pages to request.
    MAX_PAGE = 1
    custom_settings = {
        # Route items to the dedicated pipeline.
        'ITEM_PIPELINES': {'itorange.pipelines.LpPipeline': 300},
        # Randomized per-request delay instead of a single time.sleep().
        'DOWNLOAD_DELAY': 3,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Emit one POST request per page.

        Bug fix: the page number used to be passed via `meta`, which never
        reaches the server — the POST body was empty. It is now sent as a
        JSON body. TODO(review): confirm the API expects JSON rather than
        form-encoded parameters.
        """
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                url=self.start_url,
                method='POST',
                body=json.dumps({'page': page}),
                headers={'Content-Type': 'application/json'},
                meta={'page': page},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON body and yield the LP list, if present."""
        # (debug print of response.headers removed)
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误!')
            return
        # Success is signalled by code == 200 inside the JSON payload.
        if result.get('code') != 200:
            return
        lp_info = (result.get('data') or {}).get('data')
        if not lp_info:
            logging.error('未发现LP信息!')
            return
        lp_item = LpItem()
        lp_item['lp_info'] = lp_info
        yield lp_item
# -*- coding: utf-8 -*-
"""
基金机构数据爬取
"""
from ..items import FundItem
import scrapy
import json
import random
import time
import logging
logging.getLogger().setLevel(logging.INFO)
# Spider for fund-institution data.
class FundSpider(scrapy.Spider):
    """Crawl https://www.itjuzi.com/api/fund page by page via POST.

    Each successful response yields one FundItem whose `fund_info` field
    holds the raw `data.list` from the API.
    """
    name = 'fund'
    allowed_domains = ['www.itjuzi.com']
    # API endpoint.
    start_url = 'https://www.itjuzi.com/api/fund'
    # Number of pages to request.
    MAX_PAGE = 2500
    custom_settings = {
        # Route items to the dedicated pipeline.
        'ITEM_PIPELINES': {'itorange.pipelines.FundPipeline': 300},
        # Randomized per-request delay instead of a single time.sleep().
        'DOWNLOAD_DELAY': 3,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def start_requests(self):
        """Emit one POST request per page.

        Bug fix: the page number used to be passed via `meta`, which never
        reaches the server — the POST body was empty. It is now sent as a
        JSON body. TODO(review): confirm the API expects JSON rather than
        form-encoded parameters.
        """
        for page in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request(
                url=self.start_url,
                method='POST',
                body=json.dumps({'page': page}),
                headers={'Content-Type': 'application/json'},
                meta={'page': page},
                callback=self.parse,
                dont_filter=True,
            )

    def parse(self, response):
        """Decode the JSON body and yield the fund list, if present."""
        # (debug print of response.headers removed)
        try:
            result = json.loads(response.body)
        except ValueError:
            logging.error('返回报文解析错误!')
            return
        # Success is signalled by code == 200 inside the JSON payload.
        if result.get('code') != 200:
            return
        # Extract fund-institution info (the original comment said "GP" here
        # by copy-paste mistake).
        fund_info = (result.get('data') or {}).get('list')
        if not fund_info:
            logging.error('未发现基金机构信息!')
            return
        fund_item = FundItem()
        fund_item['fund_info'] = fund_info
        yield fund_item
pipelines.py 处理管道文件
# -*- coding: utf-8 -*-
"""
分管道处理不同的爬虫任务
"""
import logging
import pymongo
from scrapy.utils.project import get_project_settings
# 读取默认配置文件
settings = get_project_settings()
# 设置日志输出级别
logging.getLogger().setLevel(logging.INFO)
# 封装MongoDB文档操作
# MongoDB connection helper.
class Mongo:
    @classmethod
    def getDoc(cls):
        """Connect to MongoDB using the project settings.

        Returns the configured database handle on success, or None when
        the connection or authentication fails (preserving the original
        implicit-None contract that BasePipeline relies on).
        """
        db_name = settings['MONGO_DB']
        try:
            # Bug fix: Database.authenticate() was removed in pymongo 4.x
            # (and deprecated since 3.6) — credentials are now passed
            # directly to MongoClient.
            cls.client = pymongo.MongoClient(
                settings['MONGO_HOST'],
                settings['MONGO_PORT'],
                username=settings['MONGO_USER'],
                password=settings['MONGO_PASSWORD'],
            )
            # MongoClient connects lazily; ping forces a round-trip so bad
            # credentials/hosts fail here instead of on the first write.
            cls.client.admin.command('ping')
            logging.info('MongoDB密码验证成功!')
            logging.info('MongoDB连接成功!')
            return cls.client[db_name]
        except Exception as e:
            logging.info('MongoDB密码验证失败:%s' % (e))
            return None
# 基础任务管道类
# Base pipeline: shared DB connection setup and start/end logging.
class BasePipeline(object):
    """Common pipeline plumbing; subclasses override process_item."""

    def open_spider(self, spider):
        """Open the MongoDB connection and select the spider's collection."""
        logging.info('==================当前爬虫任务:%s' % spider.name)
        self.doc = Mongo.getDoc()
        # Bug fix: Mongo.getDoc() returns None on failure, and subscripting
        # None below raised an opaque TypeError much later. Fail fast with a
        # clear error so Scrapy aborts the crawl immediately.
        if self.doc is None:
            raise ConnectionError('MongoDB连接失败!')
        self.collection = self.doc[spider.name]
        logging.info('%s文档已创建,准备写入!' % spider.name)

    def close_spider(self, spider):
        """Log the end of the crawl."""
        logging.info('=======爬虫任务:%s结束!' % spider.name)
# 死亡公司库任务
# Pipeline for the dead-company library.
class DeathCompanyPipeline(BasePipeline):
    """Deduplicate on com_id and insert each company record."""

    def process_item(self, item, spider):
        """Insert every record from item['company_info'] that is not yet stored."""
        for info in item['company_info']:
            # find_one() is the cheap existence check — the original
            # len(list(self.collection.find(...))) materialized a whole
            # cursor per record.
            if self.collection.find_one({'com_id': info['com_id']}) is not None:
                logging.info('数据已存在,无需插入!')
                continue
            try:
                self.collection.insert_one(info)
                logging.info('已写入%s!' % spider.name)
            except Exception as e:
                logging.error('写入出错:%s' % (e))
        return item
# 投资机构任务
# Pipeline for investment institutions.
class InvestmentPipeline(BasePipeline):
    """Deduplicate on id and insert each investment-institution record."""

    def process_item(self, item, spider):
        """Insert every record from item['investment_info'] that is not yet stored."""
        for info in item['investment_info']:
            # find_one() is the cheap existence check — the original
            # len(list(self.collection.find(...))) materialized a whole
            # cursor per record. (Also fixes the 'invetment_id' typo.)
            if self.collection.find_one({'id': info['id']}) is not None:
                logging.info('数据已存在,无需插入!')
                continue
            try:
                self.collection.insert_one(info)
                logging.info('已写入%s!' % spider.name)
            except Exception as e:
                logging.error('写入出错:%s' % (e))
        return item
# LP任务
# Pipeline for LP data.
class LpPipeline(BasePipeline):
    """Deduplicate on id and insert each LP record."""

    def process_item(self, item, spider):
        """Insert every record from item['lp_info'] that is not yet stored."""
        for info in item['lp_info']:
            # find_one() is the cheap existence check — the original
            # len(list(self.collection.find(...))) materialized a whole
            # cursor per record.
            if self.collection.find_one({'id': info['id']}) is not None:
                logging.info('数据已存在,无需插入!')
                continue
            try:
                self.collection.insert_one(info)
                logging.info('已写入%s!' % spider.name)
            except Exception as e:
                logging.error('写入出错:%s' % (e))
        return item
# GP任务
# Pipeline for GP data.
class GpPipeline(BasePipeline):
    """Deduplicate on id and insert each GP record."""

    def process_item(self, item, spider):
        """Insert every record from item['gp_info'] that is not yet stored."""
        for info in item['gp_info']:
            # find_one() is the cheap existence check — the original
            # len(list(self.collection.find(...))) materialized a whole
            # cursor per record.
            if self.collection.find_one({'id': info['id']}) is not None:
                logging.info('数据已存在,无需插入!')
                continue
            try:
                self.collection.insert_one(info)
                logging.info('已写入%s!' % spider.name)
            except Exception as e:
                logging.error('写入出错:%s' % (e))
        return item
# 基金机构任务
# Pipeline for fund institutions.
class FundPipeline(BasePipeline):
    """Deduplicate on fund_id and insert each fund record."""

    def process_item(self, item, spider):
        """Insert every record from item['fund_info'] that is not yet stored."""
        for info in item['fund_info']:
            # find_one() is the cheap existence check — the original
            # len(list(self.collection.find(...))) materialized a whole
            # cursor per record.
            if self.collection.find_one({'fund_id': info['fund_id']}) is not None:
                logging.info('数据已存在,无需插入!')
                continue
            try:
                self.collection.insert_one(info)
                logging.info('已写入%s!' % spider.name)
            except Exception as e:
                logging.error('写入出错:%s' % (e))
        return item
任务执行
# 死亡公司库爬取
> scrapy crawl death_company
# 投资机构爬取
> scrapy crawl investments
# GP爬取
> scrapy crawl gp
# LP爬取
> scrapy crawl lp
# 基金机构爬取
> scrapy crawl fund