Objective
Crawl earthquake data from the China Earthquake Networks Center (CEIC), store it in MySQL, run one full crawl first, and then run incremental crawls afterwards.
Preparation
Analyzing the request URLs
By searching for earthquake data on the CEIC site (www.ceic.ac.cn/speedsearch) and inspecting the network requests, a clear pattern emerges in the request URLs.
Earthquakes in the last 24 hours: www.ceic.ac.cn/ajax/speeds…
Earthquakes in the last 48 hours: www.ceic.ac.cn/ajax/speeds…
Earthquakes in the last 7 days: www.ceic.ac.cn/ajax/speeds…
Earthquakes in the last 30 days: www.ceic.ac.cn/ajax/speeds…
Earthquakes in the last year: www.ceic.ac.cn/ajax/speeds…
So the URL for fetching earthquake data by time window is: www.ceic.ac.cn/ajax/speeds…***
A reasonable guess:
- The first number selects the time window (1: last 24 hours, 2: last 48 hours, 3: last 7 days, 4: last 30 days, 6: last year)
- The second number is the page number
- The callback parameter carries a timestamp and is only used to render the data on the page; it is optional for our purposes, which testing confirmed
This settles the URL we will crawl: www.ceic.ac.cn/ajax/speeds…
Since the goal is one full crawl plus a daily incremental crawl, only two request URLs are needed (see the sketch below):
Full crawl (last year): www.ceic.ac.cn/ajax/speeds…
Incremental crawl (last 24 hours): www.ceic.ac.cn/ajax/speeds…
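To make the pattern concrete, here is a minimal sketch of how the two URL sets can be built. The full endpoint and the 60-page count are taken from the spiders further down; build_url is only an illustrative helper, not part of the project:
# Illustrative helper: num selects the time window, page the page number.
BASE_URL = 'http://www.ceic.ac.cn/ajax/speedsearch'

def build_url(num, page):
    return '%s?num=%d&page=%d' % (BASE_URL, num, page)

full_urls = [build_url(6, p) for p in range(1, 61)]  # full crawl: last year, 60 pages
incremental_urls = [build_url(1, 1)]                 # incremental crawl: last 24 hours, page 1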
Analyzing the response
Open the following URL in a browser: www.ceic.ac.cn/ajax/speeds…
The response body is Unicode-escaped; it can be decoded with an online tool (www.jsons.cn/unicode).
The decoded value is a tuple-like wrapper containing a single JSON object. After formatting it with an online tool (www.sojson.com), the useful payload turns out to be the value of the shuju key: a list of earthquake records. The fields we care about in each record are M (magnitude), O_TIME (origin time), EPI_LAT (epicenter latitude), EPI_LON (epicenter longitude), EPI_DEPTH (focal depth), LOCATION_C (location), and id (a unique record ID, used later to deduplicate incremental crawls).
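As a quick sanity check before wiring this into Scrapy, the following sketch fetches one page and prints the fields of interest. It assumes the requests package is installed; stripping the outer parentheses mirrors the wrapper described above:
import json
import requests

# Fetch one page of the last-24-hour feed (num=1, page=1).
resp = requests.get('http://www.ceic.ac.cn/ajax/speedsearch', params={'num': 1, 'page': 1})
text = resp.text.strip()
# The body looks like ({...}); strip the wrapping parentheses before parsing.
if text.startswith('(') and text.endswith(')'):
    text = text[1:-1]
data = json.loads(text)
for record in data['shuju']:
    print(record['id'], record['O_TIME'], record['M'],
          record['EPI_LAT'], record['EPI_LON'], record['EPI_DEPTH'],
          record['LOCATION_C'])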
Storage design
Create a table EARTHQUAKE in the MySQL database GEO_DB:
CREATE TABLE `EARTHQUAKE` (
  `EARTHQUAKE_LEVEL` float(3,1) DEFAULT NULL COMMENT 'Magnitude',
  `EARTHQUAKE_TIME` datetime(6) DEFAULT NULL COMMENT 'Origin time',
  `EARTHQUAKE_LON` float(5,2) DEFAULT NULL COMMENT 'Epicenter longitude',
  `EARTHQUAKE_LAT` float(5,2) DEFAULT NULL COMMENT 'Epicenter latitude',
  `EARTHQUAKE_DEPTH` bigint(10) DEFAULT NULL COMMENT 'Focal depth',
  `EARTHQUAKE_ADDRESS` varchar(255) CHARACTER SET utf8 DEFAULT NULL COMMENT 'Epicenter location',
  `VERSION` datetime(6) DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP(6) COMMENT 'Last update time',
  `DID` varchar(20) CHARACTER SET utf8 DEFAULT NULL COMMENT 'Unique record ID'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Scrapy implementation
How Scrapy works
In short, the engine takes the requests yielded by a spider, schedules and downloads them, passes each response back to the spider's parse callback, and sends every item the spider yields through the item pipelines, which is where we will write to MySQL.
Creating the project
# Create the Scrapy project
>> scrapy startproject earthquake
>> cd earthquake/earthquake
# Create the full-crawl spider
>> scrapy genspider full_mount "www.ceic.ac.cn"
# Create the incremental spider
>> scrapy genspider increment "www.ceic.ac.cn"
The resulting directory structure is shown below.
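This is the standard layout generated by startproject/genspider (middlewares.py is created automatically but not used in this project):
earthquake/
├── scrapy.cfg
└── earthquake/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        ├── full_mount.py
        └── increment.py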
- Update the configuration file settings.py and add the MySQL connection parameters
# Do not obey robots.txt
ROBOTSTXT_OBEY = False
# Enable the item pipeline
ITEM_PIPELINES = {
    'earthquake.pipelines.EarthquakePipeline': 300,
}
# MySQL connection parameters
MYSQL_HOST = '***.***.***.***'
MYSQL_PORT = **
MYSQL_USER = '***'
MYSQL_PASSWORD = '***'
MYSQL_DB = '***'
- Define the item model in items.py
# -*- coding: utf-8 -*-
"""
Earthquake item model
"""
import scrapy


class EarthquakeItem(scrapy.Item):
    earthquake_level = scrapy.Field()
    earthquake_time = scrapy.Field()
    earthquake_lon = scrapy.Field()
    earthquake_lat = scrapy.Field()
    earthquake_depth = scrapy.Field()
    earthquake_address = scrapy.Field()
    version = scrapy.Field()
    did = scrapy.Field()
- Implement the full-crawl spider in full_mount.py
# -*- coding: utf-8 -*-
"""
Full crawl: all earthquakes worldwide from the last year
"""
import json

import scrapy

from ..items import EarthquakeItem


class FullMountSpider(scrapy.Spider):
    name = 'full_mount'
    allowed_domains = ['www.ceic.ac.cn']
    # Base URL to crawl
    start_url = 'http://www.ceic.ac.cn/ajax/speedsearch?num=6&&page='
    # Number of pages to request
    MAX_PAGE = 60

    def start_requests(self):
        # Iterate over all pages
        for i in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request('%s%d' % (self.start_url, i), callback=self.parse, dont_filter=True)

    def parse(self, response):
        # The body is Unicode-escaped JSON wrapped in parentheses, e.g. ({...});
        # strip the wrapper and parse it with json instead of eval()
        text = response.text.strip()
        if text.startswith('(') and text.endswith(')'):
            text = text[1:-1]
        result = json.loads(text)
        records = result['shuju']
        for record in records:
            item = EarthquakeItem()
            item['earthquake_level'] = record['M']
            item['earthquake_time'] = record['O_TIME']
            item['earthquake_lon'] = record['EPI_LON']
            item['earthquake_lat'] = record['EPI_LAT']
            item['earthquake_depth'] = record['EPI_DEPTH']
            item['earthquake_address'] = record['LOCATION_C']
            item['did'] = record['id']
            yield item
- Implement the incremental spider in increment.py
# -*- coding: utf-8 -*-
"""
Incremental crawl: all earthquakes worldwide from the last 24 hours
"""
import json

import scrapy

from ..items import EarthquakeItem


class IncrementSpider(scrapy.Spider):
    name = 'increment'
    allowed_domains = ['www.ceic.ac.cn']
    # Base URL to crawl
    start_url = 'http://www.ceic.ac.cn/ajax/speedsearch?num=1&&page='
    # Number of pages to request
    MAX_PAGE = 1

    def start_requests(self):
        # Iterate over the pages
        for i in range(1, self.MAX_PAGE + 1):
            yield scrapy.Request('%s%d' % (self.start_url, i), callback=self.parse, dont_filter=True)

    def parse(self, response):
        # The body is Unicode-escaped JSON wrapped in parentheses, e.g. ({...});
        # strip the wrapper and parse it with json instead of eval()
        text = response.text.strip()
        if text.startswith('(') and text.endswith(')'):
            text = text[1:-1]
        result = json.loads(text)
        records = result['shuju']
        for record in records:
            item = EarthquakeItem()
            item['earthquake_level'] = record['M']
            item['earthquake_time'] = record['O_TIME']
            item['earthquake_lon'] = record['EPI_LON']
            item['earthquake_lat'] = record['EPI_LAT']
            item['earthquake_depth'] = record['EPI_DEPTH']
            item['earthquake_address'] = record['LOCATION_C']
            item['did'] = record['id']
            yield item
- Insert the crawled data into MySQL in pipelines.py
# -*- coding: utf-8 -*-
"""
Write crawled data into MySQL
"""
import logging

import pymysql
from scrapy.utils.project import get_project_settings

logging.getLogger().setLevel(logging.INFO)
settings = get_project_settings()


class EarthquakePipeline(object):

    def open_spider(self, spider):
        # MySQL connection parameters
        host = settings['MYSQL_HOST']
        port = settings['MYSQL_PORT']
        user = settings['MYSQL_USER']
        password = settings['MYSQL_PASSWORD']
        database = settings['MYSQL_DB']
        try:
            self.conn = pymysql.connect(
                host=host, port=port, user=user, password=password, database=database)
            logging.info('Connected to MySQL.')
        except Exception as e:
            logging.error('Failed to connect to MySQL!')
            raise e

    def process_item(self, item, spider):
        # Check whether the record already exists (deduplicate by DID)
        select_sql = 'SELECT COUNT(1) FROM EARTHQUAKE WHERE DID=%s'
        try:
            cursor = self.conn.cursor()
            cursor.execute(select_sql, (item['did'],))
            result_count, = cursor.fetchone()
            if result_count > 0:
                logging.info('Record already exists, skipping.')
                return item
        except Exception:
            logging.error('Query failed: %s' % select_sql)
            return item
        # Insert the new record (parameterized query avoids quoting problems)
        insert_sql = ('INSERT INTO EARTHQUAKE(DID, EARTHQUAKE_LEVEL, EARTHQUAKE_TIME, EARTHQUAKE_LON, '
                      'EARTHQUAKE_LAT, EARTHQUAKE_DEPTH, EARTHQUAKE_ADDRESS, VERSION) '
                      'VALUES(%s, %s, %s, %s, %s, %s, %s, now())')
        try:
            cursor = self.conn.cursor()
            cursor.execute(insert_sql, (
                item['did'], item['earthquake_level'], item['earthquake_time'],
                item['earthquake_lon'], item['earthquake_lat'], item['earthquake_depth'],
                item['earthquake_address']))
            logging.info('Record inserted.')
        except Exception:
            logging.error('Insert failed: %s' % insert_sql)
            self.conn.rollback()
            return item
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
        logging.info('MySQL connection closed.')
Running the full crawl
# Crawl all earthquake data from the last year
>> scrapy crawl full_mount
Running the incremental crawl
# Crawl all earthquake data from the last 24 hours
>> scrapy crawl increment
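To keep the data up to date, the incremental spider can be scheduled to run once a day, for example with a crontab entry like the sketch below (the project path is a placeholder):
# Hypothetical crontab entry: run the incremental crawl every day at 02:00
0 2 * * * cd /path/to/earthquake && scrapy crawl increment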
Results
SELECT * FROM `GEO_DB`.`EARTHQUAKE` LIMIT 1000;