| 
									
										
										
										
											2021-03-08 15:16:14 -08:00
										 |  |  | # Copies indices (settings, mappings, and optionally data) from a 5 cluster to a 7 cluster. | 
					
						
# Note that when copying data, the copy is performed through this machine, meaning all data is downloaded from 5,
# and then uploaded to 7. This can be a very slow process if you have a lot of data, and it is recommended you only do
# this for small indices as a result.
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Requires python 3+ and elasticsearch's python lib to be installed (pip install elasticsearch). | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import argparse | 
					
						
							|  |  |  | import elasticsearch | 
					
						
							|  |  |  | import elasticsearch.helpers | 
					
						
							|  |  |  | import ssl | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | 
 | 
					
						
# Command-line interface. --source and --dest are required; the remaining
# flags tune SSL and which indices are copied (and under what name).
parser = argparse.ArgumentParser(description="Transfers ES indexes between clusters.")
parser.add_argument('-s', '--source', required=True, help='Source cluster URL and port.')
parser.add_argument('-d', '--dest', required=True, help='Destination cluster URL and port.')
parser.add_argument('--disable-source-ssl', required=False, action='store_true', help='If set, disable source SSL.')
parser.add_argument('--disable-dest-ssl', required=False, action='store_true', help='If set, disable destination SSL.')
parser.add_argument('--cert-file', required=False, default=None, help='Cert file to use with SSL.')
parser.add_argument('--key-file', required=False, default=None, help='Key file to use with SSL.')
parser.add_argument('--ca-file', required=False, default=None, help='Certificate authority file to use for SSL.')
parser.add_argument('--create-only', required=False, action='store_true', help='If set, only create the index (with settings/mappings/aliases).')
parser.add_argument('-i', '--indices', required=False, default="*", help='Regular expression for indexes to copy.')
parser.add_argument('--name-override', required=False, default=None, help='destination index name override')

# Parsed at import time; the rest of the script reads this module-level value.
args = parser.parse_args()
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def create_ssl_context(): | 
					
						
							|  |  |  |     if args.cert_file is None: | 
					
						
							|  |  |  |         raise Error('--cert-file is required with SSL.') | 
					
						
							|  |  |  |     if args.key_file is None: | 
					
						
							|  |  |  |         raise Error('--key-file is required with SSL.') | 
					
						
							|  |  |  |     if args.ca_file is None: | 
					
						
							|  |  |  |         raise Error('--ca-file is required with SSL.') | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     context = ssl.create_default_context( | 
					
						
							|  |  |  |         ssl.Purpose.SERVER_AUTH, | 
					
						
							|  |  |  |         cafile=args.ca_file | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     context.load_cert_chain( | 
					
						
							|  |  |  |         certfile=args.cert_file, | 
					
						
							|  |  |  |         keyfile=args.key_file | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return context | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def create_client(host, ssl_context): | 
					
						
							|  |  |  |     return elasticsearch.Elasticsearch( | 
					
						
							|  |  |  |         [host], | 
					
						
							|  |  |  |         ssl_context=ssl_context | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | class EsClients: | 
					
						
							|  |  |  |     def __init__(self, source_client, dest_client): | 
					
						
							|  |  |  |         self.source_client = source_client | 
					
						
							|  |  |  |         self.dest_client = dest_client | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_index_settings(client, pattern): | 
					
						
							|  |  |  |     indices = elasticsearch.client.IndicesClient(client).get(pattern) | 
					
						
							|  |  |  |     return indices | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def clean_settings(config): | 
					
						
							|  |  |  |     # Settings set by the server that we can read, but not write. | 
					
						
							|  |  |  |     del config['settings']['index']['provided_name'] | 
					
						
							|  |  |  |     del config['settings']['index']['version'] | 
					
						
							|  |  |  |     del config['settings']['index']['creation_date'] | 
					
						
							|  |  |  |     del config['settings']['index']['uuid'] | 
					
						
							|  |  |  |     return config | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def find_max_ngram_diff_helper(obj): | 
					
						
							|  |  |  |     # Finds the greatest diff in ngram settings and returns the value. In Elasticsearch 7, an upper bound must be | 
					
						
							|  |  |  |     # explicitly set. | 
					
						
							|  |  |  |     if not isinstance(obj, dict): | 
					
						
							|  |  |  |         return -1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     diff = -1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if 'min_gram' in obj and 'max_gram' in obj: | 
					
						
							|  |  |  |         diff = int(obj['max_gram']) - int(obj['min_gram']) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for value in obj.values(): | 
					
						
							|  |  |  |         t = find_max_ngram_diff_helper(value) | 
					
						
							|  |  |  |         diff = max(t, diff) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return diff | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def find_max_ngram_diff(config): | 
					
						
							|  |  |  |     settings = config['settings'] | 
					
						
							|  |  |  |     return find_max_ngram_diff_helper(settings) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def update_for_seven(config): | 
					
						
							|  |  |  |     # Updates settings and mappings for Elasticsearch 7. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Should only be one value in 5 - the doc type. Unwrap for 7; document types are deprecated. | 
					
						
							|  |  |  |     config['mappings'] = next(iter(config['mappings'].values())) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Need to set max_ngram_diff if any ngram diffs are more than 1. | 
					
						
							|  |  |  |     max_ngram = find_max_ngram_diff(config) | 
					
						
							|  |  |  |     if max_ngram > 1: | 
					
						
							|  |  |  |         config['settings']['index']['max_ngram_diff'] = max_ngram | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # _all is deprecated and also false by default; so not even explicitly needed... | 
					
						
							|  |  |  |     if '_all' in config['mappings']: | 
					
						
							|  |  |  |         enabled = config['mappings']['_all']['enabled'] | 
					
						
							|  |  |  |         if enabled: | 
					
						
							|  |  |  |             raise Error('_all is enabled') | 
					
						
							|  |  |  |         del config['mappings']['_all'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return config | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def create_index(client, name, config, name_override=None): | 
					
						
							|  |  |  |     name_override = name if name_override is None else name_override | 
					
						
							|  |  |  |     # Creates the given index on the client. | 
					
						
							|  |  |  |     indices_client = elasticsearch.client.IndicesClient(client) | 
					
						
							|  |  |  |     if indices_client.exists(name_override): | 
					
						
							|  |  |  |         print('WARNING: Index %s already exists!' % name_override) | 
					
						
							|  |  |  |         return | 
					
						
							|  |  |  |     indices_client.create(name_override, body=config) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | timing_samples = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Copy pasted from source code so that we can transform documents while copying | 
					
						
							|  |  |  | def reindex( | 
					
						
							|  |  |  |         client, | 
					
						
							|  |  |  |         source_index, | 
					
						
							|  |  |  |         target_index, | 
					
						
							|  |  |  |         query=None, | 
					
						
							|  |  |  |         target_client=None, | 
					
						
							|  |  |  |         chunk_size=500, | 
					
						
							|  |  |  |         scroll="5m", | 
					
						
							|  |  |  |         scan_kwargs={}, | 
					
						
							|  |  |  |         bulk_kwargs={}, | 
					
						
							|  |  |  | ): | 
					
						
							|  |  |  |     # Like the elasticsearch.helpers.reindex function, but with some custom logic. Namely, allows for source/dest | 
					
						
							|  |  |  |     # indices to be on different clusters, prints status updates, and deletes the _type field. | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     target_client = client if target_client is None else target_client | 
					
						
							|  |  |  |     docs = elasticsearch.helpers.scan(client, query=query, index=source_index, scroll=scroll, **scan_kwargs) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     start = time.time() | 
					
						
							|  |  |  |     count = 0 | 
					
						
							|  |  |  |     count_at_last_update = 0 | 
					
						
							|  |  |  |     last_print = start | 
					
						
							|  |  |  |     update_interval = 5 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def _change_doc_index(hits, index): | 
					
						
							|  |  |  |         for h in hits: | 
					
						
							|  |  |  |             h["_index"] = index | 
					
						
							|  |  |  |             if "fields" in h: | 
					
						
							|  |  |  |                 h.update(h.pop("fields")) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # TODO: Need to remove "_type" otherwise it complains about keyword becoming text? Is this legitimate? | 
					
						
							|  |  |  |             if "_type" in h: | 
					
						
							|  |  |  |                 del h["_type"] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             nonlocal count | 
					
						
							|  |  |  |             nonlocal last_print | 
					
						
							|  |  |  |             nonlocal count_at_last_update | 
					
						
							|  |  |  |             count = count + 1 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             # Use a window of samples to average over. | 
					
						
							|  |  |  |             if (time.time() - last_print) > update_interval: | 
					
						
							|  |  |  |                 timing_samples.append((count - count_at_last_update) / (time.time() - last_print)) | 
					
						
							|  |  |  |                 if len(timing_samples) > 10: | 
					
						
							|  |  |  |                     timing_samples.pop(0) | 
					
						
							|  |  |  |                 count_at_last_update = count | 
					
						
							|  |  |  |                 last_print = time.time() | 
					
						
							|  |  |  |                 print('Transferring %s docs/second. Total %s.' % (sum(timing_samples) / len(timing_samples), count)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             yield h | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     kwargs = {"stats_only": True} | 
					
						
							|  |  |  |     kwargs.update(bulk_kwargs) | 
					
						
							|  |  |  |     return elasticsearch.helpers.bulk( | 
					
						
							|  |  |  |         target_client, | 
					
						
							|  |  |  |         _change_doc_index(docs, target_index), | 
					
						
							|  |  |  |         chunk_size=chunk_size, | 
					
						
							|  |  |  |         raise_on_error=False, | 
					
						
							|  |  |  |         **kwargs | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def copy_index_data(clients, index, name_override): | 
					
						
							|  |  |  |     # Copies all documents from the source to the dest index. | 
					
						
							|  |  |  |     name_override = index if name_override is None else name_override | 
					
						
							|  |  |  |     print('Copying index %s' % index) | 
					
						
							|  |  |  |     start = time.time() | 
					
						
							|  |  |  |     res = reindex( | 
					
						
							|  |  |  |         clients.source_client, | 
					
						
							|  |  |  |         index, | 
					
						
							|  |  |  |         name_override, | 
					
						
							|  |  |  |         target_client=clients.dest_client | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     end = time.time() | 
					
						
							|  |  |  |     print('Documents written %s. Errors %s.' % res) | 
					
						
							|  |  |  |     print('Took %s seconds.' % (end - start)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def main(): | 
					
						
							| 
									
										
										
										
											2021-03-18 19:16:44 -07:00
										 |  |  |     ssl_context = create_ssl_context() if not args.disable_source_ssl or not args.disable_dest_ssl else None | 
					
						
							|  |  |  |     source_ssl_context = ssl_context if not args.disable_source_ssl else None | 
					
						
							|  |  |  |     dest_ssl_context = ssl_context if not args.disable_dest_ssl else None | 
					
						
							| 
									
										
										
										
											2021-03-08 15:16:14 -08:00
										 |  |  |     clients = EsClients(create_client(args.source, source_ssl_context), create_client(args.dest, dest_ssl_context)) | 
					
						
							|  |  |  |     indices = get_index_settings(clients.source_client, args.indices) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     def by_index(item): | 
					
						
							|  |  |  |         return item[0] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # Sort for repeatability, and to make it easy to restart part way if the script failed. | 
					
						
							|  |  |  |     indexSorted = list(indices.items()) | 
					
						
							|  |  |  |     indexSorted.sort(key=by_index) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for index, config in indexSorted: | 
					
						
							|  |  |  |         # Skip this "hidden" index that is listed for some reason. | 
					
						
							|  |  |  |         if index == '.kibana': | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         config = clean_settings(config) | 
					
						
							|  |  |  |         config = update_for_seven(config) | 
					
						
							|  |  |  |         print('Creating index %s' % (index if args.name_override is None else args.name_override)) | 
					
						
							|  |  |  |         create_index(clients.dest_client, index, config, args.name_override) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if args.create_only: | 
					
						
							|  |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for index, config in indexSorted: | 
					
						
							|  |  |  |         copy_index_data(clients, index, args.name_override) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | main() |