ElasticSearch跨集群数据迁移——索引复制python脚本

780 阅读1分钟

问题

在实际工作中,遇到了现有集群资源不满足新的需求的问题,需要将数据迁移到新的集群。

搜索资料了解了一下,数据迁移的方式主要有以下几种:

  • elasticsearch-dump
  • snapshot
  • reindex
  • logstash

因为安装插件比较麻烦,reindex 又比较慢,所以自己写了个 python 脚本来迁移。

脚本

# -*- coding: utf-8 -*-
"""
跨集群索引复制
Python 3.7
ES版本 6.7.1
"""
import time
from elasticsearch import Elasticsearch
from elasticsearch import helpers

# Old (source) Elasticsearch cluster: node addresses and basic-auth credentials.
# NOTE(review): the host list is identical to the new cluster's below —
# presumably placeholders; set the real addresses before running.
es_cluster1 = ['127.0.0.1:9200', '127.0.0.1:9201', '127.0.0.1:9202']
es_user1 = 'user1'
es_password1 = 'pass1'
# New (target) Elasticsearch cluster: node addresses and basic-auth credentials.
es_cluster2 = ["127.0.0.1:9200", "127.0.0.1:9201", "127.0.0.1:9202"]
es_user2 = 'user2'
es_password2 = 'pass2'


# 跨集群索引复制
def copy_es_index(index_name, doc_type):
    """Copy all documents of one index from the old cluster to the new one.

    Connects to both clusters with the module-level host lists and
    credentials, creates ``index_name`` on the new cluster, then pages
    through the old index with the scroll API and bulk-indexes every page
    into the new cluster, keeping the original document ids.

    The target index is created with a bare ``indices.create(index_name)``
    call, i.e. no settings or mappings are passed along by this function.
    If the index already exists on the new cluster, nothing is copied and
    the function returns immediately.

    Args:
        index_name: Name of the index to read from the old cluster and to
            create and fill on the new cluster.
        doc_type: Value written as ``_type`` on every copied document.

    Returns:
        None. Progress and the final document count are printed.
    """
    es1 = Elasticsearch(es_cluster1, http_auth=(es_user1, es_password1), timeout=120, port=9200)
    es2 = Elasticsearch(es_cluster2, http_auth=(es_user2, es_password2), timeout=120, port=9200)
    # Never write into an index that is already present on the target.
    if es2.indices.exists(index_name):
        return
    es2.indices.create(index_name)

    batch_size = 10000
    query_body = {
        "query": {
            "match_all": {}
        }
    }
    cnt = 0
    res = es1.search(index=index_name, body=query_body, size=batch_size, scroll='3m', timeout='5s')
    scroll_id = res['_scroll_id']
    try:
        # Page until the scroll returns no hits rather than deriving a page
        # count from hits.total, which is not a plain integer on every
        # server version.
        while res['hits']['hits']:
            actions = [
                {
                    "_index": index_name,
                    "_type": doc_type,
                    "_id": hit['_id'],
                    "_source": hit['_source'],
                }
                for hit in res['hits']['hits']
            ]
            res_insert = helpers.bulk(client=es2, actions=actions)
            # res_insert[1] holds the reported errors; include the result in
            # the message so failures are actually visible.
            if res_insert[1]:
                print('插入失败:{}'.format(res_insert))
            else:
                cnt += len(actions)
                print("copy {},copy docs:{}".format(index_name, cnt))
            res = es1.scroll(scroll_id=scroll_id, scroll='3m')
            # Always continue with the id from the latest response; it may
            # differ from the one returned by the initial search.
            scroll_id = res.get('_scroll_id', scroll_id)
    finally:
        # Release the scroll context on the old cluster even if the copy
        # failed part-way. Passed by keyword because the positional order of
        # clear_scroll's arguments differs between client versions.
        es1.clear_scroll(scroll_id=scroll_id)
    print('{} 复制完成,复制总文档数:{}'.format(index_name, cnt))


# 同步ES多个索引
def copy_es_index_data(index_names, doc_type="type"):
    """Copy several indices one after another with ``copy_es_index``.

    Args:
        index_names: Iterable of index names to copy.
        doc_type: Value forwarded to ``copy_es_index`` as the document type
            for every index. Defaults to ``"type"``, the value that used to
            be hard-coded here, so existing callers behave as before.

    Returns:
        None.
    """
    for index_name in index_names:
        copy_es_index(index_name, doc_type)


if __name__ == "__main__":
    # Script entry point: copy the listed indices and report the wall-clock
    # duration of the whole run, rounded to 4 decimal places.
    started_at = time.time()
    copy_es_index_data(['test1', 'test2'])
    elapsed_seconds = round(time.time() - started_at, 4)
    separator = '_' * 100
    print('{}\n用时:{}秒'.format(separator, elapsed_seconds))