问题
在实际工作中,遇到了现有集群资源不满足新的需求的问题,需要将数据迁移到新的集群。
搜索资料了解了一下,数据迁移的方式主要有以下几种:
- elasticsearch-dump
- snapshot
- reindex
- logstash

因为安装插件比较麻烦,reindex 又比较慢,所以自己写了个 Python 脚本来迁移。
脚本
# -*- coding: utf-8 -*-
"""
跨集群索引复制
Python 3.7
ES版本 6.7.1
"""
import time
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Source (old) Elasticsearch cluster: node addresses and credentials.
es_cluster1 = [
    '127.0.0.1:9200',
    '127.0.0.1:9201',
    '127.0.0.1:9202',
]
es_user1, es_password1 = 'user1', 'pass1'

# Target (new) Elasticsearch cluster: node addresses and credentials.
es_cluster2 = [
    '127.0.0.1:9200',
    '127.0.0.1:9201',
    '127.0.0.1:9202',
]
es_user2, es_password2 = 'user2', 'pass2'
# Cross-cluster index copy
def _bulk_copy_hits(es_target, index_name, doc_type, hits):
    """Bulk-index one page of search hits into the target cluster.

    Args:
        es_target: Elasticsearch client the documents are written to.
        index_name: Name of the index to write into.
        doc_type: Value used for the ``_type`` of every written document.
        hits: The ``hits.hits`` list of one search/scroll response.

    Returns:
        ``len(hits)`` when ``helpers.bulk`` reports no errors, otherwise 0
        (the bulk result is printed in that case).
    """
    actions = [
        {
            "_index": index_name,
            "_type": doc_type,
            "_id": hit['_id'],
            "_source": hit['_source'],
        }
        for hit in hits
    ]
    res_insert = helpers.bulk(client=es_target, actions=actions)
    if res_insert[1]:
        # The original format string had no placeholder, so the error
        # details were silently dropped from the message.
        print('插入失败:{}'.format(res_insert))
        return 0
    return len(actions)


def copy_es_index(index_name, doc_type):
    """Copy every document of ``index_name`` from the old cluster to the new one.

    The target index is created without any explicit settings or mappings
    being passed. If the index already exists on the target cluster the
    function returns immediately without copying anything.

    Args:
        index_name: Name of the index to read from and write to.
        doc_type: Value used for the ``_type`` of every written document.
    """
    es1 = Elasticsearch(es_cluster1, http_auth=(es_user1, es_password1), timeout=120, port=9200)
    es2 = Elasticsearch(es_cluster2, http_auth=(es_user2, es_password2), timeout=120, port=9200)

    # Only copy into an index that does not exist yet on the target.
    if es2.indices.exists(index_name):
        return
    es2.indices.create(index_name)

    cnt = 0
    query_body = {"query": {"match_all": {}}}
    batch_size = 10000
    res = es1.search(index=index_name, body=query_body, size=batch_size, scroll='3m', timeout='5s')
    scroll_id = res['_scroll_id']
    try:
        hits = res['hits']['hits']
        # Page until the scroll returns no hits instead of deriving the
        # iteration count from hits.total, so the loop does not depend on
        # the type of that field.
        while hits:
            cnt += _bulk_copy_hits(es2, index_name, doc_type, hits)
            print("copy {},copy docs:{}".format(index_name, cnt))
            res = es1.scroll(scroll_id=scroll_id, scroll='3m')
            # Always continue with the most recently returned scroll id.
            scroll_id = res.get('_scroll_id', scroll_id)
            hits = res['hits']['hits']
    finally:
        # Release the scroll context even if a bulk or scroll call failed.
        es1.clear_scroll(scroll_id=scroll_id)
    print('{} 复制完成,复制总文档数:{}'.format(index_name, cnt))
# Copy several ES indices
def copy_es_index_data(index_names, doc_type="type"):
    """Copy each index in ``index_names`` via ``copy_es_index``, one at a time.

    Args:
        index_names: Iterable of index names to copy.
        doc_type: Document type passed to ``copy_es_index`` for every index.
            Defaults to ``"type"``, the value that was previously hard-coded.
    """
    for index_name in index_names:
        copy_es_index(index_name, doc_type)
if __name__ == "__main__":
    # Indices to copy.
    all_index_name = ['test1', 'test2']

    # Time the whole run with wall-clock timestamps.
    started_at = time.time()
    copy_es_index_data(all_index_name)
    elapsed = round(time.time() - started_at, 4)

    separator = '_' * 100
    print('{}\n用时:{}秒'.format(separator, elapsed))