Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-07-04 07:27:34 +00:00

Occasionally the ES test can fail because the index fails to be created on the first try. Experiments show that adding a timeout doesn't help, but adding a retry mitigates the issue. See the history of commits in branch yao/bump-inference-to-0.6.6 and https://github.com/Unstructured-IO/unstructured/pull/1563

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: badGarnet <badGarnet@users.noreply.github.com>
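For context, the mitigation relies on the elasticsearch-py per-request options() API rather than a longer client timeout. A minimal sketch of the pattern follows, with a placeholder cluster URL, index name, and mapping standing in for the values the script below reads from es_cluster_config:

from elasticsearch import Elasticsearch

# Placeholder connection details; the test script reads these from es_cluster_config.
es = Elasticsearch("http://localhost:9200", request_timeout=30)

# A longer request_timeout alone did not help; retrying the create request does.
# options() applies per-request settings such as max_retries without mutating the client.
response = es.options(max_retries=5).indices.create(
    index="ingest-test-index",  # placeholder index name
    mappings={"properties": {"text": {"type": "text"}}},  # placeholder mapping
)
print(response.meta.status)  # 200 on success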
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

from es_cluster_config import (
    CLUSTER_URL,
    DATA_PATH,
    INDEX_NAME,
    MAPPINGS,
    form_elasticsearch_doc_dict,
)

print("Connecting to the Elasticsearch cluster.")
es = Elasticsearch(CLUSTER_URL, request_timeout=30)
print(es.info())

# Load the source CSV and drop incomplete rows before indexing.
df = pd.read_csv(DATA_PATH).dropna().reset_index()

print("Creating an Elasticsearch index for testing elasticsearch ingest.")
# Retry the create request up to 5 times; the first attempt occasionally fails.
response = es.options(max_retries=5).indices.create(index=INDEX_NAME, mappings=MAPPINGS)
if response.meta.status != 200:
    raise RuntimeError("failed to create index")

print("Loading data into the index.")
# Build one bulk action per CSV row and send them in a single bulk request.
bulk_data = []
for i, row in df.iterrows():
    bulk_data.append(form_elasticsearch_doc_dict(i, row))
bulk(es, bulk_data)

# Refresh so the newly indexed documents are searchable, then fetch the document count.
es.indices.refresh(index=INDEX_NAME)
response = es.cat.count(index=INDEX_NAME, format="json")

print("Successfully created and filled an Elasticsearch index for testing elasticsearch ingest.")
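The helper form_elasticsearch_doc_dict is imported from es_cluster_config and is not shown on this page. As a rough, assumed sketch of the shape the bulk() helper expects from it (the field names here are hypothetical, not the actual fixture):

# Hypothetical sketch only; the real helper lives in es_cluster_config and
# maps whichever CSV columns the ingest test actually uses.
def form_elasticsearch_doc_dict(i, row):
    return {
        "_index": INDEX_NAME,  # target index for this bulk action
        "_id": i,              # use the dataframe row index as the document id
        "_source": {
            "text": row["text"],  # assumed CSV column name
        },
    }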