mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
feat: add opensearch source and destination connector (#2349)
Adds OpenSearch as a source and destination. Since OpenSearch is a fork of Elasticsearch, these connectors rely heavily on inheriting the Elasticsearch connectors whenever possible. - Adds OpenSearch source connector to be able to ingest documents from OpenSearch. - Adds OpenSearch destination connector to be able to ingest documents from any supported source, embed them and write the embeddings / documents into OpenSearch. - Defines an example unstructured elements schema for users to be able to setup their unstructured OpenSearch indexes easily. --------- Co-authored-by: potter-potter <david.potter@gmail.com>
This commit is contained in:
parent
d7f4c24e21
commit
bc791d53f4
@ -1,4 +1,4 @@
|
||||
## 0.12.1-dev9
|
||||
## 0.12.1-dev10
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
|
||||
### Features
|
||||
* **MongoDB Source Connector.** New source connector added to all CLI ingest commands to support downloading/partitioning files from MongoDB.
|
||||
* **Add OpenSearch source and destination connectors.** OpenSearch, a fork of Elasticsearch, is a popular storage solution for various functionality such as search, or providing intermediary caches within data pipelines. Feature: Added OpenSearch source connector to support downloading/partitioning files. Added OpenSearch destination connector to be able to ingest documents from any supported source, embed them and write the embeddings / documents into OpenSearch.
|
||||
|
||||
### Fixes
|
||||
|
||||
|
@ -12,6 +12,7 @@ include requirements/ingest-wikipedia.in
|
||||
include requirements/ingest-google-drive.in
|
||||
include requirements/ingest-gcs.in
|
||||
include requirements/ingest-elasticsearch.in
|
||||
include requirements/ingest-opensearch.in
|
||||
include requirements/ingest-dropbox.in
|
||||
include requirements/ingest-box.in
|
||||
include requirements/ingest-onedrive.in
|
||||
|
4
Makefile
4
Makefile
@ -179,6 +179,10 @@ install-ingest-wikipedia:
|
||||
install-ingest-elasticsearch:
|
||||
python3 -m pip install -r requirements/ingest/elasticsearch.txt
|
||||
|
||||
.PHONY: install-ingest-opensearch
|
||||
install-ingest-opensearch:
|
||||
python3 -m pip install -r requirements/ingest/opensearch.txt
|
||||
|
||||
.PHONY: install-ingest-confluence
|
||||
install-ingest-confluence:
|
||||
python3 -m pip install -r requirements/ingest/confluence.txt
|
||||
|
@ -0,0 +1,19 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
EMBEDDING_PROVIDER=${EMBEDDING_PROVIDER:-"langchain-huggingface"}
|
||||
|
||||
unstructured-ingest \
|
||||
local \
|
||||
--input-path example-docs/book-war-and-peace-1225p.txt \
|
||||
--output-dir local-output-to-opensearch \
|
||||
--strategy fast \
|
||||
--chunk-elements \
|
||||
--embedding-provider "$EMBEDDING_PROVIDER" \
|
||||
--num-processes 4 \
|
||||
--verbose \
|
||||
opensearch \
|
||||
--hosts "$OPENSEARCH_HOSTS" \
|
||||
--username "$OPENSEARCH_USERNAME" \
|
||||
--password "$OPENSEARCH_PASSWORD" \
|
||||
--index-name "$OPENSEARCH_INDEX_NAME" \
|
||||
--num-processes 2
|
@ -0,0 +1,62 @@
|
||||
import os
|
||||
|
||||
from unstructured.ingest.connector.elasticsearch import (
|
||||
ElasticsearchWriteConfig,
|
||||
)
|
||||
from unstructured.ingest.connector.local import SimpleLocalConfig
|
||||
from unstructured.ingest.connector.opensearch import (
|
||||
OpenSearchAccessConfig,
|
||||
SimpleOpenSearchConfig,
|
||||
)
|
||||
from unstructured.ingest.interfaces import (
|
||||
ChunkingConfig,
|
||||
EmbeddingConfig,
|
||||
PartitionConfig,
|
||||
ProcessorConfig,
|
||||
ReadConfig,
|
||||
)
|
||||
from unstructured.ingest.runner import LocalRunner
|
||||
from unstructured.ingest.runner.writers.base_writer import Writer
|
||||
from unstructured.ingest.runner.writers.opensearch import (
|
||||
OpenSearchWriter,
|
||||
)
|
||||
|
||||
|
||||
def get_writer() -> Writer:
    """Build the OpenSearch destination writer used by this example.

    Connection details are read from the ``OPENSEARCH_HOSTS``,
    ``OPENSEARCH_USERNAME``, ``OPENSEARCH_PASSWORD`` and
    ``OPENSEARCH_INDEX_NAME`` environment variables.

    Returns:
        An ``OpenSearchWriter`` configured with the connection settings and
        the upload batching options below.
    """
    access = OpenSearchAccessConfig(
        hosts=os.getenv("OPENSEARCH_HOSTS"),
        username=os.getenv("OPENSEARCH_USERNAME"),
        password=os.getenv("OPENSEARCH_PASSWORD"),
    )
    connector = SimpleOpenSearchConfig(
        access_config=access,
        index_name=os.getenv("OPENSEARCH_INDEX_NAME"),
    )
    # The write config class comes from the Elasticsearch connector module.
    upload_settings = ElasticsearchWriteConfig(
        batch_size_bytes=15_000_000,
        num_processes=2,
    )
    return OpenSearchWriter(
        connector_config=connector,
        write_config=upload_settings,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
writer = get_writer()
|
||||
runner = LocalRunner(
|
||||
processor_config=ProcessorConfig(
|
||||
verbose=True,
|
||||
output_dir="local-output-to-opensearch",
|
||||
num_processes=2,
|
||||
),
|
||||
connector_config=SimpleLocalConfig(
|
||||
input_path="example-docs/book-war-and-peace-1225p.txt",
|
||||
),
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(),
|
||||
chunking_config=ChunkingConfig(chunk_elements=True),
|
||||
embedding_config=EmbeddingConfig(
|
||||
provider="langchain-huggingface",
|
||||
),
|
||||
writer=writer,
|
||||
writer_kwargs={},
|
||||
)
|
||||
runner.run()
|
@ -8,7 +8,7 @@
|
||||
"analyzer": "english"
|
||||
},
|
||||
"type": {
|
||||
"type": "keyword"
|
||||
"type": "text"
|
||||
},
|
||||
"embeddings": {
|
||||
"type": "dense_vector",
|
||||
|
@ -0,0 +1,152 @@
|
||||
{"settings": {
|
||||
"index": {
|
||||
"knn": true,
|
||||
"knn.algo_param.ef_search": 100
|
||||
}
|
||||
},
|
||||
"mappings": {
|
||||
"properties": {
|
||||
"element_id": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"text": {
|
||||
"type": "text",
|
||||
"analyzer": "english"
|
||||
},
|
||||
"type": {
|
||||
"type": "text"
|
||||
},
|
||||
"embeddings": {
|
||||
"type": "knn_vector",
|
||||
"dimension": 384
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"category_depth": {
|
||||
"type": "integer"
|
||||
},
|
||||
"parent_id": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"attached_to_filename": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"filetype": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"last_modified": {
|
||||
"type": "date"
|
||||
},
|
||||
"file_directory": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"filename": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"data_source": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "text",
|
||||
"analyzer": "standard"
|
||||
},
|
||||
"version": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"date_created": {
|
||||
"type": "date"
|
||||
},
|
||||
"date_modified": {
|
||||
"type": "date"
|
||||
},
|
||||
"date_processed": {
|
||||
"type": "date"
|
||||
},
|
||||
"record_locator": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"permissions_data": {
|
||||
"type": "object"
|
||||
}
|
||||
}
|
||||
},
|
||||
"coordinates": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"system": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"layout_width": {
|
||||
"type": "float"
|
||||
},
|
||||
"layout_height": {
|
||||
"type": "float"
|
||||
},
|
||||
"points": {
|
||||
"type": "float"
|
||||
}
|
||||
}
|
||||
},
|
||||
"languages": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"page_number": {
|
||||
"type": "integer"
|
||||
},
|
||||
"page_name": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"url": {
|
||||
"type": "text",
|
||||
"analyzer": "standard"
|
||||
},
|
||||
"links": {
|
||||
"type": "object"
|
||||
},
|
||||
"link_urls": {
|
||||
"type": "text"
|
||||
},
|
||||
"link_texts": {
|
||||
"type": "text"
|
||||
},
|
||||
"sent_from": {
|
||||
"type": "text",
|
||||
"analyzer": "standard"
|
||||
},
|
||||
"sent_to": {
|
||||
"type": "text",
|
||||
"analyzer": "standard"
|
||||
},
|
||||
"subject": {
|
||||
"type": "text",
|
||||
"analyzer": "standard"
|
||||
},
|
||||
"section": {
|
||||
"type": "text",
|
||||
"analyzer": "standard"
|
||||
},
|
||||
"header_footer_type": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"emphasized_text_contents": {
|
||||
"type": "text"
|
||||
},
|
||||
"emphasized_text_tags": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"text_as_html": {
|
||||
"type": "text",
|
||||
"analyzer": "standard"
|
||||
},
|
||||
"regex_metadata": {
|
||||
"type": "object"
|
||||
},
|
||||
"detection_class_prob": {
|
||||
"type": "float"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -30,3 +30,13 @@ upstream local connector.
|
||||
For a full list of the options the CLI accepts check ``unstructured-ingest <upstream connector> elasticsearch --help``.
|
||||
|
||||
NOTE: Keep in mind that you will need to have all the appropriate extras and dependencies for the file types of the documents contained in your data storage platform if you're running this locally. You can find more information about this in the `installation guide <https://unstructured-io.github.io/unstructured/installing.html>`_.
|
||||
|
||||
Vector Search Sample Mapping
|
||||
----------------------------
|
||||
|
||||
To make sure the schema of the index matches the data being written to it, a sample mapping json can be used.
|
||||
|
||||
.. literalinclude:: ./data/elasticsearch_elements_mapping.json
|
||||
:language: json
|
||||
:linenos:
|
||||
:caption: Object description
|
42
docs/source/ingest/destination_connectors/opensearch.rst
Normal file
42
docs/source/ingest/destination_connectors/opensearch.rst
Normal file
@ -0,0 +1,42 @@
|
||||
OpenSearch
|
||||
======================
|
||||
|
||||
Batch process all your records using ``unstructured-ingest`` to store structured outputs locally on your filesystem and upload those local files to an OpenSearch index.
|
||||
|
||||
First you'll need to install OpenSearch dependencies as shown here.
|
||||
|
||||
.. code:: shell
|
||||
|
||||
pip install "unstructured[opensearch]"
|
||||
|
||||
Run Locally
|
||||
-----------
|
||||
The upstream connector can be any of the ones supported, but for convenience the sample command shown here uses the
|
||||
upstream local connector.
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. tab:: Shell
|
||||
|
||||
.. literalinclude:: ./code/bash/opensearch.sh
|
||||
:language: bash
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
.. literalinclude:: ./code/python/opensearch.py
|
||||
:language: python
|
||||
|
||||
|
||||
For a full list of the options the CLI accepts check ``unstructured-ingest <upstream connector> opensearch --help``.
|
||||
|
||||
NOTE: Keep in mind that you will need to have all the appropriate extras and dependencies for the file types of the documents contained in your data storage platform if you're running this locally. You can find more information about this in the `installation guide <https://unstructured-io.github.io/unstructured/installing.html>`_.
|
||||
|
||||
Vector Search Sample Mapping
|
||||
----------------------------
|
||||
|
||||
To make sure the schema of the index matches the data being written to it, a sample mapping json can be used.
|
||||
|
||||
.. literalinclude:: ./data/opensearch_elements_mappings.json
|
||||
:language: json
|
||||
:linenos:
|
||||
:caption: Object description
|
10
docs/source/ingest/source_connectors/code/bash/opensearch.sh
Normal file
10
docs/source/ingest/source_connectors/code/bash/opensearch.sh
Normal file
@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Ingest the selected fields of every document in the "movies" index served at
# localhost:9200 and write the structured output to opensearch-ingest-output.
unstructured-ingest \
  opensearch \
  --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
  --hosts http://localhost:9200 \
  --index-name movies \
  --fields 'ethnicity, director, plot' \
  --output-dir opensearch-ingest-output \
  --num-processes 2
|
@ -0,0 +1,12 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Same ingest as the local example, but partitioning is delegated to the
# unstructured API; replace the API key placeholder before running.
unstructured-ingest \
  opensearch \
  --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
  --hosts http://localhost:9200 \
  --index-name movies \
  --fields 'ethnicity, director, plot' \
  --output-dir opensearch-ingest-output \
  --num-processes 2 \
  --partition-by-api \
  --api-key "<UNSTRUCTURED-API-KEY>"
|
@ -0,0 +1,25 @@
|
||||
from unstructured.ingest.connector.opensearch import (
|
||||
OpenSearchAccessConfig,
|
||||
SimpleOpenSearchConfig,
|
||||
)
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig
|
||||
from unstructured.ingest.runner import OpenSearchRunner
|
||||
|
||||
if __name__ == "__main__":
|
||||
runner = OpenSearchRunner(
|
||||
processor_config=ProcessorConfig(
|
||||
verbose=True,
|
||||
output_dir="opensearch-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
|
||||
),
|
||||
connector_config=SimpleOpenSearchConfig(
|
||||
access_config=OpenSearchAccessConfig(hosts=["http://localhost:9200"]),
|
||||
index_name="movies",
|
||||
fields=["ethnicity", "director", "plot"],
|
||||
),
|
||||
)
|
||||
runner.run()
|
@ -0,0 +1,29 @@
|
||||
import os
|
||||
|
||||
from unstructured.ingest.connector.opensearch import (
|
||||
OpenSearchAccessConfig,
|
||||
SimpleOpenSearchConfig,
|
||||
)
|
||||
from unstructured.ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig
|
||||
from unstructured.ingest.runner import OpenSearchRunner
|
||||
|
||||
if __name__ == "__main__":
|
||||
runner = OpenSearchRunner(
|
||||
processor_config=ProcessorConfig(
|
||||
verbose=True,
|
||||
output_dir="opensearch-ingest-output",
|
||||
num_processes=2,
|
||||
),
|
||||
read_config=ReadConfig(),
|
||||
partition_config=PartitionConfig(
|
||||
metadata_exclude=["filename", "file_directory", "metadata.data_source.date_processed"],
|
||||
partition_by_api=True,
|
||||
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
|
||||
),
|
||||
connector_config=SimpleOpenSearchConfig(
|
||||
access_config=OpenSearchAccessConfig(hosts=["http://localhost:9200"]),
|
||||
index_name="movies",
|
||||
fields=["ethnicity", "director", "plot"],
|
||||
),
|
||||
)
|
||||
runner.run()
|
47
docs/source/ingest/source_connectors/opensearch.rst
Normal file
47
docs/source/ingest/source_connectors/opensearch.rst
Normal file
@ -0,0 +1,47 @@
|
||||
OpenSearch
|
||||
==============
|
||||
Connect OpenSearch to your preprocessing pipeline, and batch process all your documents using ``unstructured-ingest`` to store structured outputs locally on your filesystem.
|
||||
|
||||
First you'll need to install the OpenSearch dependencies as shown here.
|
||||
|
||||
.. code:: shell
|
||||
|
||||
pip install "unstructured[opensearch]"
|
||||
|
||||
Run Locally
|
||||
-----------
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. tab:: Shell
|
||||
|
||||
.. literalinclude:: ./code/bash/opensearch.sh
|
||||
:language: bash
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
.. literalinclude:: ./code/python/opensearch.py
|
||||
:language: python
|
||||
|
||||
Run via the API
|
||||
---------------
|
||||
|
||||
You can also use upstream connectors with the ``unstructured`` API. For this you'll need to use the ``--partition-by-api`` flag and pass in your API key with ``--api-key``.
|
||||
|
||||
.. tabs::
|
||||
|
||||
.. tab:: Shell
|
||||
|
||||
.. literalinclude:: ./code/bash/opensearch_api.sh
|
||||
:language: bash
|
||||
|
||||
.. tab:: Python
|
||||
|
||||
.. literalinclude:: ./code/python/opensearch_api.py
|
||||
:language: python
|
||||
|
||||
Additionally, you will need to pass the ``--partition-endpoint`` if you're running the API locally. You can find more information about the ``unstructured`` API `here <https://github.com/Unstructured-IO/unstructured-api>`_.
|
||||
|
||||
For a full list of the options the CLI accepts check ``unstructured-ingest opensearch --help``.
|
||||
|
||||
NOTE: Keep in mind that you will need to have all the appropriate extras and dependencies for the file types of the documents contained in your data storage platform if you're running this locally. You can find more information about this in the `installation guide <https://unstructured-io.github.io/unstructured/installing.html>`_.
|
@ -39,7 +39,7 @@ To use any of the data connectors, you must install the specific dependency:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
"airtable", "azure", "azure-cognitive-search", "biomed", "box", "confluence", "delta-table", "discord", "dropbox", "elasticsearch", "gcs", "github", "gitlab", "google-drive", "jira", "mongodb", "notion", "onedrive", "outlook", "reddit", "s3", "sharepoint", "salesforce", "slack", "wikipedia"
|
||||
"airtable", "azure", "azure-cognitive-search", "biomed", "box", "confluence", "delta-table", "discord", "dropbox", "elasticsearch", "gcs", "github", "gitlab", "google-drive", "jira", "mongodb", "notion", "opensearch", "onedrive", "outlook", "reddit", "s3", "sharepoint", "salesforce", "slack", "wikipedia"
|
||||
|
||||
Installation with ``conda`` on Windows
|
||||
--------------------------------------
|
||||
|
35
examples/ingest/opensearch/destination.sh
Executable file
35
examples/ingest/opensearch/destination.sh
Executable file
@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Processes a file from local, chunks, embeds, and writes the results to an opensearch index.
|
||||
|
||||
# Structured outputs are stored in local-to-opensearch/
|
||||
|
||||
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
|
||||
cd "$SCRIPT_DIR"/../../.. || exit 1
|
||||
|
||||
# As an example we're using the local connector,
|
||||
# however ingesting from any supported source connector is possible.
|
||||
# shellcheck disable=2094
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--input-path example-docs/book-war-and-peace-1225p.txt \
|
||||
--output-dir local-to-opensearch \
|
||||
--strategy fast \
|
||||
--chunk-elements \
|
||||
--embedding-provider "<an unstructured embedding provider, ie. langchain-huggingface>" \
|
||||
--num-processes 2 \
|
||||
--verbose \
|
||||
--work-dir "<directory for intermediate outputs to be saved>" \
|
||||
opensearch \
|
||||
--hosts "<List of URLs where opensearch index is served>" \
|
||||
--index-name "<Index name to upload data in>" \
|
||||
--username "<Username to authenticate into the index>" \
|
||||
--password "<Password to authenticate into the index>" \
|
||||
--batch-size-bytes "<Size limit for any batch to be uploaded, in bytes, ie. 15000000>" \
|
||||
--num-processes "<Number of processes to be used to upload, ie. 2>" \
|
||||
--ca-certs "<path/to/ca/certs>" \
|
||||
--client-cert "<path/to/client/cert>" \
|
||||
--client-key "<path/to/client/key>" \
|
||||
--use-ssl \
|
||||
--verify-certs \
|
||||
--ssl-show-warn
|
30
examples/ingest/opensearch/source.sh
Executable file
30
examples/ingest/opensearch/source.sh
Executable file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# Runs a docker container to create an opensearch cluster,
|
||||
# fills the opensearch cluster with data,
|
||||
# processes all the files in the 'movies' index in the cluster using the `unstructured` library.
|
||||
|
||||
# Structured outputs are stored in opensearch-ingest-output
|
||||
|
||||
# shellcheck source=/dev/null
|
||||
sh scripts/opensearch-test-helpers/source_connector/create-fill-and-check-opensearch.sh
|
||||
wait
|
||||
|
||||
# Kill the container so the script can be repeatedly run using the same ports.
# The name must match container_name in scripts/opensearch-test-helpers/common/docker-compose.yaml.
trap 'echo "Stopping opensearch Docker container"; docker stop opensearch-test' EXIT
|
||||
|
||||
# Run the ingest CLI against the opensearch source connector.
# Every value in angle brackets is a placeholder to replace before running.
PYTHONPATH=. ./unstructured/ingest/main.py \
  opensearch \
  --hosts "<List of URLs where opensearch index is served>" \
  --index-name "<Index name to ingest data from>" \
  --username "<Username to authenticate into the index>" \
  --password "<Password to authenticate into the index>" \
  --fields "<If provided, will limit the fields returned by opensearch to this comma-delimited list>" \
  --batch-size "<How many records to read at a time per process>" \
  --num-processes "<Number of processes to be used to upload, ie. 2>" \
  --ca-certs "<path/to/ca/certs>" \
  --client-cert "<path/to/client/cert>" \
  --client-key "<path/to/client/key>" \
  --use-ssl \
  --verify-certs \
  --ssl-show-warn
|
3
requirements/ingest/opensearch.in
Normal file
3
requirements/ingest/opensearch.in
Normal file
@ -0,0 +1,3 @@
|
||||
-c ../constraints.in
|
||||
-c ../base.txt
|
||||
opensearch-py
|
41
requirements/ingest/opensearch.txt
Normal file
41
requirements/ingest/opensearch.txt
Normal file
@ -0,0 +1,41 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with Python 3.9
|
||||
# by the following command:
|
||||
#
|
||||
# pip-compile --output-file=ingest/opensearch.txt ingest/opensearch.in
|
||||
#
|
||||
certifi==2023.11.17
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# opensearch-py
|
||||
# requests
|
||||
charset-normalizer==3.3.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# requests
|
||||
idna==3.6
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# requests
|
||||
opensearch-py==2.4.2
|
||||
# via -r ingest/opensearch.in
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# opensearch-py
|
||||
requests==2.31.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# opensearch-py
|
||||
six==1.16.0
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# opensearch-py
|
||||
# python-dateutil
|
||||
urllib3==1.26.18
|
||||
# via
|
||||
# -c ingest/../base.txt
|
||||
# -c ingest/../constraints.in
|
||||
# opensearch-py
|
||||
# requests
|
14
scripts/opensearch-test-helpers/common/docker-compose.yaml
Normal file
14
scripts/opensearch-test-helpers/common/docker-compose.yaml
Normal file
@ -0,0 +1,14 @@
|
||||
services:
|
||||
opensearch:
|
||||
image: opensearchproject/opensearch:2.11.1
|
||||
container_name: opensearch-test
|
||||
ports:
|
||||
- 9200:9200
|
||||
- 9600:9600
|
||||
environment:
|
||||
- discovery.type=single-node
|
||||
healthcheck:
|
||||
test: curl --fail https://localhost:9200/_cat/health -ku 'admin:admin' >/dev/null || exit 1
|
||||
interval: 10s
|
||||
timeout: 30s
|
||||
retries: 3
|
@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR=$(dirname "$(dirname "$(realpath "$0")")")
|
||||
|
||||
# Create the Opensearch cluster
|
||||
docker compose version
|
||||
docker compose -f "$SCRIPT_DIR"/common/docker-compose.yaml up --wait
|
||||
docker compose -f "$SCRIPT_DIR"/common/docker-compose.yaml ps
|
||||
|
||||
echo "Cluster is live."
|
||||
python "$SCRIPT_DIR"/destination_connector/create_index.py
|
@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from opensearch_cluster_config import (
|
||||
INDEX_NAME,
|
||||
mappings,
|
||||
)
|
||||
from opensearchpy import OpenSearch
|
||||
|
||||
print("Connecting to the OpenSearch cluster.")
|
||||
client = OpenSearch(
|
||||
hosts=[{"host": "localhost", "port": 9200}],
|
||||
http_auth=("admin", "admin"),
|
||||
use_ssl=True,
|
||||
verify_certs=False,
|
||||
ssl_show_warn=False,
|
||||
)
|
||||
print(client.info())
|
||||
|
||||
print("Creating an OpenSearch index for testing ingest opensearch destination connector.")
|
||||
response = client.indices.create(index=INDEX_NAME, body=mappings)
|
||||
if not response["acknowledged"]:
|
||||
raise RuntimeError("failed to create index")
|
||||
|
||||
# Final status line, printed once the index has been created.
print("Successfully created an OpenSearch index for testing opensearch ingest.")
|
@ -0,0 +1,10 @@
|
||||
import json
|
||||
|
||||
CLUSTER_URL = "http://localhost:9200"
|
||||
INDEX_NAME = "ingest-test-destination"
|
||||
USER = "admin"
|
||||
PASSWORD = "admin"
|
||||
MAPPING_PATH = "docs/source/ingest/destination_connectors/data/opensearch_elements_mappings.json"
|
||||
|
||||
with open(MAPPING_PATH) as f:
|
||||
mappings = json.load(f)
|
@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
|
||||
from opensearchpy import OpenSearch
|
||||
|
||||
N_ELEMENTS = 5
|
||||
EXPECTED_TEXT = "To Whom it May Concern:"
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("Connecting to the OpenSearch cluster.")
|
||||
client = OpenSearch(
|
||||
hosts=[{"host": "localhost", "port": 9200}],
|
||||
http_auth=("admin", "admin"),
|
||||
use_ssl=True,
|
||||
verify_certs=False,
|
||||
ssl_show_warn=False,
|
||||
)
|
||||
print(client.info())
|
||||
|
||||
initial_query = {"query": {"simple_query_string": {"fields": ["text"], "query": EXPECTED_TEXT}}}
|
||||
initial_result = client.search(index="ingest-test-destination", body=initial_query)
|
||||
initial_embeddings = initial_result["hits"]["hits"][0]["_source"]["embeddings"]
|
||||
|
||||
query = {"size": 1, "query": {"knn": {"embeddings": {"vector": initial_embeddings, "k": 1}}}}
|
||||
|
||||
vector_search = client.search(index="ingest-test-destination", body=query)
|
||||
|
||||
# The k=1 nearest neighbour of a document's own embedding should be that same document.
# An explicit comparison is used instead of `assert` so the check still runs under `python -O`.
top_hit_text = vector_search["hits"]["hits"][0]["_source"]["text"]
if top_hit_text != EXPECTED_TEXT:
    sys.exit(
        "OpenSearch dest check failed: "
        f"Did not find {EXPECTED_TEXT} via vector search."
    )
print("OpenSearch vector search test was successful.")
|
||||
|
||||
count = int(client.count(index="ingest-test-destination")["count"])
|
||||
try:
|
||||
assert count == N_ELEMENTS
|
||||
except AssertionError:
|
||||
sys.exit(
|
||||
"OpenSearch dest check failed:"
|
||||
f"got {count} items in index, expected {N_ELEMENTS} items in index."
|
||||
)
|
||||
print(f"OpenSearch destination test was successful with {count} items being uploaded.")
|
@ -0,0 +1,13 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR="$(dirname "$(dirname "$(realpath "$0")")")"
|
||||
|
||||
# Create the Opensearch cluster
|
||||
docker compose version
|
||||
docker compose -f "$SCRIPT_DIR"/common/docker-compose.yaml up --wait
|
||||
docker compose -f "$SCRIPT_DIR"/common/docker-compose.yaml ps
|
||||
|
||||
echo "Cluster is live."
|
||||
"$SCRIPT_DIR"/source_connector/create_and_fill_opensearch.py
|
@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env python3
|
||||
from contextlib import suppress
|
||||
|
||||
import pandas as pd
|
||||
from opensearchpy import Document, Keyword, OpenSearch, Text
|
||||
from opensearchpy.exceptions import NotFoundError
|
||||
|
||||
DATA_PATH = "scripts/opensearch-test-helpers/wiki_movie_plots_small.csv"
|
||||
CLUSTER_URL = "http://localhost:9200"
|
||||
INDEX_NAME = "movies"
|
||||
|
||||
|
||||
class Movie(Document):
    """opensearch-py ``Document`` describing one movie record in the ``movies`` index."""

    # Text field with an additional keyword sub-field named ``raw``.
    title = Text(fields={"raw": Keyword()})
    # All remaining attributes are plain text fields.
    year = Text()
    director = Text()
    cast = Text()
    genre = Text()
    wiki_page = Text()
    ethnicity = Text()
    plot = Text()

    class Index:
        # Name of the index this document type is stored in.
        name = "movies"

    def save(self, **kwargs):
        """Persist this document by delegating unchanged to ``Document.save``."""
        return super().save(**kwargs)
|
||||
|
||||
|
||||
print("Connecting to the OpenSearch cluster.")
|
||||
client = OpenSearch(
|
||||
hosts=[{"host": "localhost", "port": 9200}],
|
||||
http_auth=("admin", "admin"),
|
||||
use_ssl=True,
|
||||
verify_certs=False,
|
||||
ssl_show_warn=False,
|
||||
)
|
||||
print(client.info())
|
||||
df = pd.read_csv(DATA_PATH).dropna().reset_index()
|
||||
|
||||
with suppress(NotFoundError):
|
||||
client.indices.delete(index="movies")
|
||||
|
||||
print("Creating an OpenSearch index for testing opensearch ingest.")
|
||||
response = client.indices.create(index=INDEX_NAME)
|
||||
if not response.get("acknowledged"):
|
||||
raise RuntimeError("failed to create index")
|
||||
|
||||
# Set up the Movie index mapping once, before the loop, rather than once per row.
Movie.init(using=client)

# Store every CSV row as a Movie document, using the DataFrame index as the document id.
for i, row in df.iterrows():
    movie = Movie(
        meta={"id": i},
        title=row["Title"],
        year=row["Release Year"],
        director=row["Director"],
        cast=row["Cast"],
        genre=row["Genre"],
        wiki_page=row["Wiki Page"],
        ethnicity=row["Origin/Ethnicity"],
        plot=row["Plot"],
    )
    movie.save(using=client)
|
||||
|
||||
client.count()
|
||||
|
||||
print("Successfully created and filled an OpenSearch index for testing opensearch ingest.")
|
31
scripts/opensearch-test-helpers/wiki_movie_plots_small.csv
Normal file
31
scripts/opensearch-test-helpers/wiki_movie_plots_small.csv
Normal file
@ -0,0 +1,31 @@
|
||||
Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
|
||||
1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Smashers,"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"
|
||||
1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon,"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better."
|
||||
1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Presidents,"The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.
|
||||
In the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice."
|
||||
1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King","Lasting just 61 seconds and consisting of two shots, the first shot is set in a wood during winter. The actor representing then vice-president Theodore Roosevelt enthusiastically hurries down a hillside towards a tree in the foreground. He falls once, but rights himself and cocks his rifle. Two other men, bearing signs reading ""His Photographer"" and ""His Press Agent"" respectively, follow him into the shot; the photographer sets up his camera. ""Teddy"" aims his rifle upward at the tree and fells what appears to be a common house cat, which he then proceeds to stab. ""Teddy"" holds his prize aloft, and the press agent takes notes. The second shot is taken in a slightly different part of the wood, on a path. ""Teddy"" rides the path on his horse towards the camera and out to the left of the shot, followed closely by the press agent and photographer, still dutifully holding their signs."
|
||||
1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film),"The earliest known adaptation of the classic fairytale, this films shows Jack trading his cow for the beans, his mother forcing him to drop them in the front yard, and beig forced upstairs. As he sleeps, Jack is visited by a fairy who shows him glimpses of what will await him when he ascends the bean stalk. In this version, Jack is the son of a deposed king. When Jack wakes up, he finds the beanstalk has grown and he climbs to the top where he enters the giant's home. The giant finds Jack, who narrowly escapes. The giant chases Jack down the bean stalk, but Jack is able to cut it down before the giant can get to safety. He falls and is killed as Jack celebrates. The fairy then reveals that Jack may return home as a prince."
|
||||
1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film),"Alice follows a large white rabbit down a ""Rabbit-hole"". She finds a tiny door. When she finds a bottle labeled ""Drink me"", she does, and shrinks, but not enough to pass through the door. She then eats something labeled ""Eat me"" and grows larger. She finds a fan when enables her to shrink enough to get into the ""Garden"" and try to get a ""Dog"" to play with her. She enters the ""White Rabbit's tiny House,"" but suddenly resumes her normal size. In order to get out, she has to use the ""magic fan.""
|
||||
She enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. ""The Duchess's Cheshire Cat"" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's ""Mad Tea-Party."" After a while, she leaves.
|
||||
The Queen invites Alice to join the ""ROYAL PROCESSION"": a parade of marching playing cards and others headed by the White Rabbit. When Alice ""unintentionally offends the Queen"", the latter summons the ""Executioner"". Alice ""boxes the ears"", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream."
|
||||
1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film),"The film opens with two bandits breaking into a railroad telegraph office, where they force the operator at gunpoint to have a train stopped and to transmit orders for the engineer to fill the locomotive's tender at the station's water tank. They then knock the operator out and tie him up. As the train stops it is boarded by the bandits—now four. Two bandits enter an express car, kill a messenger and open a box of valuables with dynamite; the others kill the fireman and force the engineer to halt the train and disconnect the locomotive. The bandits then force the passengers off the train and rifle them for their belongings. One passenger tries to escape but is instantly shot down. Carrying their loot, the bandits escape in the locomotive, later stopping in a valley where their horses had been left.
|
||||
Meanwhile, back in the telegraph office, the bound operator awakens, but he collapses again. His daughter arrives bringing him his meal and cuts him free, and restores him to consciousness by dousing him with water.
|
||||
There is some comic relief at a dance hall, where an Eastern stranger is forced to dance while the locals fire at his feet. The door suddenly opens and the telegraph operator rushes in to tell them of the robbery. The men quickly form a posse, which overtakes the bandits, and in a final shootout kills them all and recovers the stolen mail."
|
||||
1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,"The film is about a family who move to the suburbs, hoping for a quiet life. Things start to go wrong, and the wife gets violent and starts throwing crockery, leading to her arrest."
|
||||
1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train_Robbery,"The opening scene shows the interior of the robbers' den. The walls are decorated with the portraits of notorious criminals and pictures illustrating the exploits of famous bandits. Some of the gang are lounging about, while others are reading novels and illustrated papers. Although of youthful appearance, each is dressed like a typical Western desperado. The ""Bandit Queen,"" leading a blindfolded new recruit, now enters the room. He is led to the center of the room, raises his right hand and is solemnly sworn in. When the bandage is removed from his eyes he finds himself looking into the muzzles of a dozen or more 45's. The gang then congratulates the new member and heartily shake his hand. The ""Bandit Queen"" who is evidently the leader of the gang, now calls for volunteers to hold up a train. All respond, but she picks out seven for the job who immediately leave the cabin.
|
||||
The next scene shows the gang breaking into a barn. They steal ponies and ride away. Upon reaching the place agreed upon they picket their ponies and leaving them in charge of a trusted member proceed to a wild mountain spot in a bend of the railroad, where the road runs over a steep embankment. The spot is an ideal one for holding up a train. Cross ties are now placed on the railroad track and the gang hide in some bushes close by and wait for the train. The train soon approaches and is brought to a stop. The engineer leaves his engine and proceeds to remove the obstruction on the track. While he is bending over one of the gang sneaks up behind them and hits him on the head with an axe, and knocks him senseless down the embankment, while the gang surround the train and hold up the passengers. After securing all the ""valuables,"" consisting principally of candy and dolls, the robbers uncouple the engine and one car and make their escape just in time to avoid a posse of police who appear on the scene. Further up the road they abandon the engine and car, take to the woods and soon reach their ponies.
|
||||
In the meantime the police have learned the particulars of the hold-up from the frightened passengers and have started up the railroad tracks after the fleeing robbers. The robbers are next seen riding up the bed of a shallow stream and finally reach their den, where the remainder of the gang have been waiting for them. Believing they have successfully eluded their pursuers, they proceed to divide the ""plunder."" The police, however, have struck the right trail and are in close pursuit. While the ""plunder"" is being divided a sentry gives the alarm and the entire gang, abandoning everything, rush from the cabin barely in time to escape capture. The police make a hurried search and again start in pursuit. The robbers are so hard pressed that they are unable to reach their ponies, and are obliged to take chances on foot. The police now get in sight of the fleeing robbers and a lively chase follows through tall weeds, over a bridge and up a steep hill. Reaching a pond the police are close on their heels. The foremost robbers jump in clothes and all and strike out for the opposite bank. Two hesitate and are captured. Boats are secured and after an exciting tussle the entire gang is rounded up. In the mix up one of the police is dragged overboard. The final scene shows the entire gang of bedraggled and crestfallen robbers tied together with a rope and being led away by the police. Two of the police are loaded down with revolvers, knives and cartridge belts, and resemble walking aresenals. As a fitting climax a confederate steals out of the woods, cuts the rope and gallantly rescues the ""Bandit Queen."""
|
||||
1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film),"Scenes are introduced using lines of the poem.[2] Santa Claus, played by Harry Eytinge, is shown feeding real reindeer[4] and finishes his work in the workshop. Meanwhile, the children of a city household hang their stockings and go to bed, but unable to sleep they engage in a pillow fight. Santa Claus leaves his home on a sleigh with his reindeer. He enters the children's house through the chimney, and leaves the presents. The children come down the stairs and enjoy their presents."
|
||||
1906,Dream of a Rarebit Fiend,American,Wallace McCutcheon and Edwin S. Porter,,short,https://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film),"The Rarebit Fiend gorges on Welsh rarebit at a restaurant. When he leaves, he begins to get dizzy as he starts to hallucinate. He desperately tries to hang onto a lamppost as the world spins all around him. A man helps him get home. He falls into bed and begins having more hallucinatory dreams. During a dream sequence, the furniture begins moving around the room. Imps emerge from a floating Welsh rarebit container and begin poking his head as he sleeps. His bed then begins dancing and spinning wildly around the room before flying out the window with the Fiend in it. The bed floats across the city as the Fiend floats up and off the bed. He hangs off the back and eventually gets caught on a weathervane atop a steeple. His bedclothes tear and he falls from the sky, crashing through his bedroom ceiling. The Fiend awakens from the dream after falling out of his bed."
|
||||
1906,From Leadville to Aspen: A Hold-Up in the Rockies,American,Francis J. Marion and Wallace McCutcheon,,short action/crime western,https://en.wikipedia.org/wiki/From_Leadville_to_Aspen:_A_Hold-Up_in_the_Rockies,The film features a train traveling through the Rockies and a hold up created by two thugs placing logs on the line. They systematically rob the wealthy occupants at gunpoint and then make their getaway along the tracks and later by a hi-jacked horse and cart.
|
||||
1906,Kathleen Mavourneen,American,Edwin S. Porter,,short film,https://en.wikipedia.org/wiki/Kathleen_Mavourneen_(1906_film),"Irish villager Kathleen is a tenant of Captain Clearfield, who controls local judges and criminals. Her father owes Clearfield a large debt. Terence O'More saves the village from Clearfield, causing a large celebration.
|
||||
Film historian Charles Musser writes of Porter's adaptation, ""O'More not only rescues Kathleen from the villain but, through marriage, renews the family for another generation.""[1]"
|
||||
1907,Daniel Boone,American,Wallace McCutcheon and Ediwin S. Porter,"William Craven, Florence Lawrence",biographical,https://en.wikipedia.org/wiki/Daniel_Boone_(1907_film),"Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief.[2]"
|
||||
1907,How Brown Saw the Baseball Game,American,Unknown,Unknown,comedy,https://en.wikipedia.org/wiki/How_Brown_Saw_the_Baseball_Game,"Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication.[1]"
|
||||
1907,Laughing Gas,American,Edwin Stanton Porter,"Bertha Regustus, Edward Boulden",comedy,https://en.wikipedia.org/wiki/Laughing_Gas_(film)#1907_Film,"The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets ""catches"" the laughter from her, including a vendor and police officers."
|
||||
1908,The Adventures of Dollie,American,D. W. Griffith,"Arthur V. Johnson, Linda Arvidson",drama,https://en.wikipedia.org/wiki/The_Adventures_of_Dollie,"On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents."
|
||||
1908,The Black Viper,American,D. W. Griffith,D. W. Griffith,drama,https://en.wikipedia.org/wiki/The_Black_Viper,"A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house."
|
||||
1908,A Calamitous Elopement,American,D.W. Griffith,"Harry Solter, Linda Arvidson",comedy,https://en.wikipedia.org/wiki/A_Calamitous_Elopement,"A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings."
|
||||
1908,The Call of the Wild,American,D. W. Griffith,Charles Inslee,adventure,https://en.wikipedia.org/wiki/The_Call_of_the_Wild_(1908_film),"A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as ""The Biograph Girl."""
|
||||
1908,A Christmas Carol,American,Unknown,Tom Ricketts,drama,https://en.wikipedia.org/wiki/A_Christmas_Carol_(1908_film),"No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release.[2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life."
|
||||
1908,The Fight for Freedom,American,D. W. Griffith,"Florence Auer, John G. Adolfi",western,https://en.wikipedia.org/wiki/The_Fight_for_Freedom,"The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town."
|
|
56
test_unstructured_ingest/dest/opensearch.sh
Executable file
56
test_unstructured_ingest/dest/opensearch.sh
Executable file
@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env bash

# Abort as soon as any command fails.
set -o errexit

# Resolve locations from this script's own path so it can be started from any
# directory: DEST_PATH is the folder containing this file, SCRIPT_DIR is its
# parent, and the working directory becomes the parent of SCRIPT_DIR.
DEST_PATH="$(dirname "$(realpath "$0")")"
SCRIPT_DIR="$(dirname "$DEST_PATH")"
cd "${SCRIPT_DIR}/.." || exit 1

# Output and scratch locations; OUTPUT_ROOT and CI may be supplied by the
# environment and otherwise fall back to the defaults given here.
OUTPUT_FOLDER_NAME="opensearch-dest"
OUTPUT_ROOT="${OUTPUT_ROOT:-$SCRIPT_DIR}"
OUTPUT_DIR="${OUTPUT_ROOT}/structured-output/${OUTPUT_FOLDER_NAME}"
WORK_DIR="${OUTPUT_ROOT}/workdir/${OUTPUT_FOLDER_NAME}"
CI="${CI:-false}"

# Worker count: MAX_PROCESSES when set and non-empty, otherwise the CPU count
# reported by Python (the := form also stores that value in MAX_PROCESSES).
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# Load the shared helper script (the cleanup function below calls cleanup_dir).
# shellcheck disable=SC1091
source "${SCRIPT_DIR}/cleanup.sh"
|
||||
# Tear down everything the test created: the OpenSearch container and the
# local working/output directories.
function cleanup {
  # Index cleanup: bring the compose project down, removing its volumes (-v)
  # and any orphaned containers.
  echo "Stopping OpenSearch Docker container"
  docker-compose -f scripts/opensearch-test-helpers/common/docker-compose.yaml down --remove-orphans -v

  # Local file cleanup
  cleanup_dir "$WORK_DIR"
  cleanup_dir "$OUTPUT_DIR"
  # DOWNLOAD_DIR is never assigned in this script, so only clean it when the
  # environment actually provides one; otherwise cleanup_dir would be invoked
  # with an empty path.
  if [ "$CI" == "true" ] && [ -n "${DOWNLOAD_DIR:-}" ]; then
    cleanup_dir "$DOWNLOAD_DIR"
  fi
}
|
||||
|
||||
# Run cleanup on every exit path, whether the test passes or fails.
trap cleanup EXIT

echo "Creating opensearch instance"
# shellcheck source=/dev/null
scripts/opensearch-test-helpers/destination_connector/create-opensearch-instance.sh
wait

# Arguments for the ingest CLI: everything up to "opensearch" configures the
# local source, everything after it configures the OpenSearch destination.
ingest_args=(
  local
  --num-processes "$max_processes"
  --output-dir "$OUTPUT_DIR"
  --strategy fast
  --verbose
  --reprocess
  --input-path example-docs/fake-memo.pdf
  --work-dir "$WORK_DIR"
  --embedding-provider "langchain-huggingface"
  opensearch
  --hosts http://localhost:9200
  --index-name ingest-test-destination
  --username "admin"
  --password "admin"
  --use-ssl
  --batch-size-bytes 150
  --num-processes "$max_processes"
)

PYTHONPATH=. ./unstructured/ingest/main.py "${ingest_args[@]}"

# Check what was written to the destination index.
scripts/opensearch-test-helpers/destination_connector/test-ingest-opensearch-output.py
|
@ -0,0 +1,107 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "0",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "a1515877c1c63770057b2615cce25c5d",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "0",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "Alice follows a large white rabbit down a \"Rabbit-hole\". She finds a tiny door. When she finds a bottle labeled \"Drink me\", she does, and shrinks, but not enough to pass through the door. She then eats something labeled \"Eat me\" and grows larger. She finds a fan when enables her to shrink enough to get into the \"Garden\" and try to get a \"Dog\" to play with her. She enters the \"White Rabbit's tiny House,\" but suddenly resumes her normal size. In order to get out, she has to use the \"magic fan.\"",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "58ec505c394f8af4fc5c62bad6973652",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "0",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "She enters a kitchen, in which there is a cook and a woman holding a baby. She persuades the woman to give her the child and takes the infant outside after the cook starts throwing things around. The baby then turns into a pig and squirms out of her grip. \"The Duchess's Cheshire Cat\" appears and disappears a couple of times to Alice and directs her to the Mad Hatter's \"Mad Tea-Party.\" After a while, she leaves.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "fffac28d27f8cea00e96f1e876a1d1f8",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "0",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "The Queen invites Alice to join the \"ROYAL PROCESSION\": a parade of marching playing cards and others headed by the White Rabbit. When Alice \"unintentionally offends the Queen\", the latter summons the \"Executioner\". Alice \"boxes the ears\", then flees when all the playing cards come for her. Then she wakes up and realizes it was all a dream.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "0d15a8bf4961deb609a392a8444e3520",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "0",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "Cecil Hepworth",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "1",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "d30707943c5b8e45088e21b0a9ba6f1a",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "1",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "Boone's daughter befriends an Indian maiden as Boone and his companion start out on a hunting expedition. While he is away, Boone's cabin is attacked by the Indians, who set it on fire and abduct Boone's daughter. Boone returns, swears vengeance, then heads out on the trail to the Indian camp. His daughter escapes but is chased. The Indians encounter Boone, which sets off a huge fight on the edge of a cliff. A burning arrow gets shot into the Indian camp. Boone gets tied to the stake and tortured. The burning arrow sets the Indian camp on fire, causing panic. Boone is rescued by his horse, and Boone has a knife fight in which he kills the Indian chief. [2]",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "576608bb13aa67420e79d575e0e26071",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "1",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "Wallace McCutcheon and Ediwin S. Porter",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "2",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "2659129a67b301911027d0ea747109e4",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "2",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "Before heading out to a baseball game at a nearby ballpark, sports fan Mr. Brown drinks several highball cocktails. He arrives at the ballpark to watch the game, but has become so inebriated that the game appears to him in reverse, with the players running the bases backwards and the baseball flying back into the pitcher's hand. After the game is over, Mr. Brown is escorted home by one of his friends. When they arrive at Brown's house, they encounter his wife who becomes furious with the friend and proceeds to physically assault him, believing he is responsible for her husband's severe intoxication. [1]",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "b764cdc0eab7137467211272fa539f12",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "2",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "Unknown",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "3",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "6ead4aca7b509813147c41699dd1a7d4",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "3",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "The plot is that of a black woman going to the dentist for a toothache and being given laughing gas. On her way walking home, and in other situations, she can't stop laughing, and everyone she meets \"catches\" the laughter from her, including a vendor and police officers.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "9496aba3ea633310e2d669820269ad00",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "3",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "Edwin Stanton Porter",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "4",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "b8004022d0994669c5fdb4ec8a5088a9",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "4",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "On a beautiful summer day a father and mother take their daughter Dollie on an outing to the river. The mother refuses to buy a gypsy's wares. The gypsy tries to rob the mother, but the father drives him off. The gypsy returns to the camp and devises a plan. They return and kidnap Dollie while her parents are distracted. A rescue crew is organized, but the gypsy takes Dollie to his camp. They gag Dollie and hide her in a barrel before the rescue party gets to the camp. Once they leave the gypsies and escapes in their wagon. As the wagon crosses the river, the barrel falls into the water. Still sealed in the barrel, Dollie is swept downstream in dangerous currents. A boy who is fishing in the river finds the barrel, and Dollie is reunited safely with her parents.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "4",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "D. W. Griffith",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "5",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "8e16a508f3df737af12e84d9cba2c7d0",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "5",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "A thug accosts a girl as she leaves her workplace but a man rescues her. The thug vows revenge and, with the help of two friends, attacks the girl and her rescuer again as they're going for a walk. This time they succeed in kidnapping the rescuer. He is bound and gagged and taken away in a cart. The girl runs home and gets help from several neighbors. They track the ruffians down to a cabin in the mountains where the gang has trapped their victim and set the cabin on fire. A thug and Rescuer fight on the roof of the house.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "5",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "D. W. Griffith",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "6",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "9e92dee6e0d6ef246f51d7f8f4eb8c01",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "6",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "A young couple decides to elope after being caught in the midst of a romantic moment by the woman's angry father. They make plans to leave, but a thief discovers their plans and hides in their trunk and waits for the right moment to steal their belongings.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "2ac7798b427181278fb2b450e28f4902",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "6",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "D.W. Griffith",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "7",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "d366dfc3239f22e3c03ee629f6567a68",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "7",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "A white girl (Florence Lawrence) rejects a proposal from an Indian brave (Charles Inslee) in this early one-reel Western melodrama. Despite the rejection, the Indian still comes to the girl's defense when she is abducted by his warring tribe. In her first year in films, Florence Lawrence was already the most popular among the Biograph Company's anonymous stock company players. By 1909, she was known the world over as \"The Biograph Girl.\"",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "7",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "D. W. Griffith",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "8",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "7ddfc82896f749f2c5b5c5baac5a93bf",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "8",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "No prints of the first American film adaptation of A Christmas Carol are known to exist,[1] but The Moving Picture World magazine provided a scene-by-scene description before the film's release. [2] Scrooge goes into his office and begins working. His nephew, along with three women who wish for Scrooge to donate enter. However, Scrooge dismisses them. On the night of Christmas Eve, his long-dead partner Jacob Marley comes as a ghost, warning him of a horrible fate if he does not change his ways. Scrooge meets three spirits that show Scrooge the real meaning of Christmas, along with his grave, the result of his parsimonious ways. The next morning, he wakes and realizes the error of his ways. Scrooge was then euphoric and generous for the rest of his life.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "b764cdc0eab7137467211272fa539f12",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "8",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "Unknown",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
@ -0,0 +1,65 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0baaf6f25d1c5da82d9e4a3229cd45bd",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "9",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "American",
|
||||
"type": "Title"
|
||||
},
|
||||
{
|
||||
"element_id": "b87d0bbbe5c735bca621fc172fc44605",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "9",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "The film opens in a town on the Mexican border. A poker game is going on in the local saloon. One of the players cheats and is shot dead by another of the players, a Mexican named Pedro. In the uproar that follows Pedro is wounded as he escapes from the saloon. The sheriff is called, who tracks Pedro to his home but Pedro kills the sherriff too. While Pedro hides, his wife Juanita, is arrested on suspicion of murdering the sheriff. Pedro rescues her from the town jail and the two head for the Mexican border. Caught by the posse before they reach the border, Juanita is killed and the film ends with Pedro being arrested and taken back to town.",
|
||||
"type": "NarrativeText"
|
||||
},
|
||||
{
|
||||
"element_id": "9abdc842ab4bacbaa4da45cec2ef7e0d",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"record_locator": {
|
||||
"document_id": "9",
|
||||
"hosts": [
|
||||
"http://localhost:9200"
|
||||
],
|
||||
"index_name": "movies"
|
||||
},
|
||||
"version": 1
|
||||
},
|
||||
"filetype": "text/plain",
|
||||
"languages": [
|
||||
"eng"
|
||||
]
|
||||
},
|
||||
"text": "D. W. Griffith",
|
||||
"type": "Title"
|
||||
}
|
||||
]
|
57
test_unstructured_ingest/src/opensearch.sh
Executable file
57
test_unstructured_ingest/src/opensearch.sh
Executable file
@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env bash

# Integration test for the OpenSearch source connector.
#
# Runs the ingest CLI against the "movies" index of a local OpenSearch
# instance and diffs the structured output against the checked-in fixtures.

set -e

SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
echo "SCRIPT_DIR: $SCRIPT_DIR"
OUTPUT_FOLDER_NAME=opensearch
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
DOWNLOAD_DIR=$SCRIPT_DIR/download/$OUTPUT_FOLDER_NAME
# Default to one process per CPU unless MAX_PROCESSES is already set.
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
CI=${CI:-"false"}

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

# Tear down the test container and remove generated directories.
# The download directory is only removed when running in CI.
function cleanup() {
  # Kill the container so the script can be repeatedly run using the same ports
  echo "Stopping OpenSearch Docker container"
  docker-compose -f scripts/opensearch-test-helpers/common/docker-compose.yaml down --remove-orphans -v

  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
  if [ "$CI" == "true" ]; then
    cleanup_dir "$DOWNLOAD_DIR"
  fi
}

trap cleanup EXIT

# The helper is executed (not sourced); it is expected to bring up and
# populate the test cluster before the ingest run below.
scripts/opensearch-test-helpers/source_connector/create-and-check-opensearch.sh
wait

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  opensearch \
  --download-dir "$DOWNLOAD_DIR" \
  --metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --num-processes "$max_processes" \
  --preserve-downloads \
  --reprocess \
  --output-dir "$OUTPUT_DIR" \
  --verbose \
  --index-name movies \
  --hosts http://localhost:9200 \
  --username admin \
  --password admin \
  --use-ssl \
  --fields 'ethnicity,director,plot' \
  --work-dir "$WORK_DIR" \
  --batch-size 2

# Quote the argument so it is always passed as a single word.
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"
|
@ -31,6 +31,7 @@ all_tests=(
|
||||
'sharepoint-embed-cog-index.sh'
|
||||
'sqlite.sh'
|
||||
'weaviate.sh'
|
||||
'opensearch.sh'
|
||||
)
|
||||
|
||||
full_python_matrix_tests=(
|
||||
|
@ -55,6 +55,7 @@ all_tests=(
|
||||
'local-embed.sh'
|
||||
'sftp.sh'
|
||||
'mongodb.sh'
|
||||
'opensearch.sh'
|
||||
)
|
||||
|
||||
full_python_matrix_tests=(
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.12.1-dev9" # pragma: no cover
|
||||
__version__ = "0.12.1-dev10" # pragma: no cover
|
||||
|
@ -38,6 +38,8 @@ from .mongodb import get_base_dest_cmd as mongo_base_dest_cmd
|
||||
from .mongodb import get_base_src_cmd as mongodb_base_src_cmd
|
||||
from .notion import get_base_src_cmd as notion_base_src_cmd
|
||||
from .onedrive import get_base_src_cmd as onedrive_base_src_cmd
|
||||
from .opensearch import get_base_dest_cmd as opensearch_base_dest_cmd
|
||||
from .opensearch import get_base_src_cmd as opensearch_base_src_cmd
|
||||
from .outlook import get_base_src_cmd as outlook_base_src_cmd
|
||||
from .pinecone import get_base_dest_cmd as pinecone_base_dest_cmd
|
||||
from .qdrant import get_base_dest_cmd as qdrant_base_dest_cmd
|
||||
@ -73,6 +75,7 @@ base_src_cmd_fns: t.List[t.Callable[[], BaseSrcCmd]] = [
|
||||
mongodb_base_src_cmd,
|
||||
notion_base_src_cmd,
|
||||
onedrive_base_src_cmd,
|
||||
opensearch_base_src_cmd,
|
||||
outlook_base_src_cmd,
|
||||
reddit_base_src_cmd,
|
||||
salesforce_base_src_cmd,
|
||||
@ -109,6 +112,7 @@ base_dest_cmd_fns: t.List[t.Callable[[], "BaseDestCmd"]] = [
|
||||
mongo_base_dest_cmd,
|
||||
pinecone_base_dest_cmd,
|
||||
qdrant_base_dest_cmd,
|
||||
opensearch_base_dest_cmd,
|
||||
]
|
||||
|
||||
# Make sure there are not overlapping names
|
||||
|
117
unstructured/ingest/cli/cmds/opensearch.py
Normal file
117
unstructured/ingest/cli/cmds/opensearch.py
Normal file
@ -0,0 +1,117 @@
|
||||
import typing as t
|
||||
from dataclasses import dataclass
|
||||
|
||||
import click
|
||||
|
||||
from unstructured.ingest.cli.base.src import BaseSrcCmd
|
||||
from unstructured.ingest.cli.cmds.elasticsearch import ElasticsearchCliWriteConfig
|
||||
from unstructured.ingest.cli.interfaces import CliConfig, DelimitedString
|
||||
from unstructured.ingest.connector.opensearch import SimpleOpenSearchConfig
|
||||
|
||||
CMD_NAME = "opensearch"
|
||||
|
||||
|
||||
@dataclass
class OpenSearchCliConfig(SimpleOpenSearchConfig, CliConfig):
    """CLI view of the OpenSearch connector config.

    Declares the click options shared by the ``opensearch`` source and
    destination commands.
    """

    @staticmethod
    def get_cli_options() -> t.List[click.Option]:
        """Return the click options accepted by the ``opensearch`` commands.

        Returns:
            The option list covering index selection, hosts, field filtering,
            basic-auth credentials, TLS settings and the read batch size.
        """
        options = [
            click.Option(
                ["--index-name"],
                required=True,
                type=str,
                help="Name of the OpenSearch index to pull data from, or upload data to.",
            ),
            click.Option(
                ["--hosts"],
                type=DelimitedString(),
                help='List of the OpenSearch hosts to connect to, e.g. "http://localhost:9200"',
            ),
            click.Option(
                ["--fields"],
                type=DelimitedString(),
                default=[],
                help="If provided, will limit the fields returned by OpenSearch "
                "to this comma-delimited list",
            ),
            click.Option(
                ["--username"], type=str, default=None, help="username when using basic auth"
            ),
            click.Option(
                ["--password"],
                type=str,
                default=None,
                help="password when using basic auth",
            ),
            click.Option(
                ["--use-ssl"],
                type=bool,
                default=False,
                is_flag=True,
                help="use ssl for the connection",
            ),
            click.Option(
                ["--verify-certs"],
                type=bool,
                default=False,
                is_flag=True,
                help="whether to verify SSL certificates",
            ),
            click.Option(
                ["--ssl-show-warn"],
                type=bool,
                default=False,
                is_flag=True,
                help="show warning when verify certs is disabled",
            ),
            click.Option(
                ["--ca-certs"],
                type=click.Path(),
                default=None,
                help="path to CA bundle",
            ),
            click.Option(
                ["--client-cert"],
                type=click.Path(),
                default=None,
                help="path to the file containing the private key and the certificate,"
                " or cert only if using client_key",
            ),
            click.Option(
                ["--client-key"],
                type=click.Path(),
                default=None,
                help="path to the file containing the private key"
                " if using separate cert and key files",
            ),
            click.Option(
                ["--batch-size"],
                default=100,
                # A batch size of 0 cannot partition the document ids (the source
                # connector divides by it), so reject it at the CLI boundary.
                type=click.IntRange(1),
                help="how many records to read at a time per process",
            ),
        ]
        return options
|
||||
|
||||
|
||||
def get_base_src_cmd() -> BaseSrcCmd:
    """Build the source-side CLI command for the OpenSearch connector.

    Returns:
        A ``BaseSrcCmd`` named after the module-level ``CMD_NAME`` and
        configured with ``OpenSearchCliConfig``.
    """
    # Use the shared constant so the source and destination commands cannot drift.
    cmd_cls = BaseSrcCmd(
        cmd_name=CMD_NAME,
        cli_config=OpenSearchCliConfig,
    )
    return cmd_cls
|
||||
|
||||
|
||||
def get_base_dest_cmd():
    """Build the destination-side CLI command for the OpenSearch connector.

    The write options are reused from the Elasticsearch CLI write config.

    Returns:
        A ``BaseDestCmd`` named after the module-level ``CMD_NAME``.
    """
    # Imported inside the function, as in the original; presumably to avoid an
    # import cycle with the CLI base package -- confirm before hoisting.
    from unstructured.ingest.cli.base.dest import BaseDestCmd

    cmd_cls = BaseDestCmd(
        cmd_name=CMD_NAME,
        cli_config=OpenSearchCliConfig,
        additional_cli_options=[ElasticsearchCliWriteConfig],
        addition_configs={
            "connector_config": SimpleOpenSearchConfig,
            "write_config": ElasticsearchCliWriteConfig,
        },
    )
    return cmd_cls
|
@ -388,6 +388,7 @@ class ElasticsearchDestinationConnector(BaseDestinationConnector):
|
||||
"element_id": element_dict.pop("element_id", None),
|
||||
"embeddings": element_dict.pop("embeddings", None),
|
||||
"text": element_dict.pop("text", None),
|
||||
"type": element_dict.pop("type", None),
|
||||
"metadata": flatten_dict(
|
||||
element_dict.pop("metadata", None),
|
||||
separator="-",
|
||||
|
219
unstructured/ingest/connector/opensearch.py
Normal file
219
unstructured/ingest/connector/opensearch.py
Normal file
@ -0,0 +1,219 @@
|
||||
import typing as t
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from dataclasses_json.core import Json
|
||||
|
||||
from unstructured.ingest.connector.elasticsearch import (
|
||||
ElasticsearchDestinationConnector,
|
||||
ElasticsearchDocumentMeta,
|
||||
ElasticsearchIngestDoc,
|
||||
ElasticsearchIngestDocBatch,
|
||||
ElasticsearchSourceConnector,
|
||||
SimpleElasticsearchConfig,
|
||||
)
|
||||
from unstructured.ingest.enhanced_dataclass import enhanced_field
|
||||
from unstructured.ingest.error import DestinationConnectionError, SourceConnectionError
|
||||
from unstructured.ingest.interfaces import AccessConfig, BaseSingleIngestDoc
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.ingest.utils.data_prep import generator_batching_wbytes
|
||||
from unstructured.staging.base import flatten_dict
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from opensearchpy import OpenSearch
|
||||
|
||||
"""Since the actual OpenSearch project is a fork of Elasticsearch, we are relying
|
||||
heavily on the Elasticsearch connector code, inheriting the functionality as much as possible."""
|
||||
|
||||
|
||||
@dataclass
class OpenSearchAccessConfig(AccessConfig):
    """Connection, TLS and basic-auth settings for the OpenSearch client.

    Within this module the serialized form (see ``to_dict``) is splatted
    directly into the ``OpenSearch(...)`` constructor.
    """

    hosts: t.Optional[t.List[str]] = None
    username: t.Optional[str] = None
    password: t.Optional[str] = enhanced_field(default=None, sensitive=True)
    use_ssl: bool = False
    verify_certs: bool = False
    ssl_show_warn: bool = False
    ca_certs: t.Optional[str] = None
    client_cert: t.Optional[str] = None
    client_key: t.Optional[str] = None

    def to_dict(self, **kwargs) -> t.Dict[str, Json]:
        """Serialize the config, adding the ``http_auth`` pair when credentials exist.

        Args:
            **kwargs: Forwarded unchanged to the parent ``to_dict``.

        Returns:
            The parent's dictionary, plus ``http_auth=(username, password)``
            if at least one of the two credentials is set.
        """
        d = super().to_dict(**kwargs)
        # Only request basic auth when credentials were actually supplied.
        # NOTE(review): opensearch-py is understood to join a tuple http_auth into
        # "user:password", which cannot work for (None, None) -- confirm.
        if self.username is not None or self.password is not None:
            d["http_auth"] = (self.username, self.password)
        return d
|
||||
|
||||
|
||||
@dataclass
class SimpleOpenSearchConfig(SimpleElasticsearchConfig):
    """Elasticsearch connector config whose access settings are OpenSearch-specific."""

    # Narrows the inherited access config to the OpenSearch variant.
    access_config: OpenSearchAccessConfig = None
|
||||
|
||||
|
||||
@dataclass
class OpenSearchIngestDoc(ElasticsearchIngestDoc):
    """A single OpenSearch document tracked through the ingest pipeline.

    Fetching and processing are kept separate: this class only describes the
    document and where its results go. In this module the document body is
    written to disk by ``OpenSearchIngestDocBatch.get_files``, which is why
    ``get_file`` here has nothing left to do.
    """

    connector_config: SimpleOpenSearchConfig
    registry_name: str = "opensearch"

    @SourceConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    @BaseSingleIngestDoc.skip_if_file_exists
    def get_file(self):
        """Do nothing; the owning batch is responsible for writing the file."""
        return None
|
||||
|
||||
|
||||
@dataclass
class OpenSearchIngestDocBatch(ElasticsearchIngestDocBatch):
    """A group of OpenSearch documents that are fetched together by id."""

    connector_config: SimpleOpenSearchConfig
    ingest_docs: t.List[OpenSearchIngestDoc] = field(default_factory=list)
    registry_name: str = "opensearch_batch"

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def _get_docs(self):
        """Return the hits for ``self.list_of_ids`` as a list.

        A fresh client is built from the access config and the index is
        scanned with an ``ids`` query limited to the configured fields.
        """
        from opensearchpy import OpenSearch
        from opensearchpy.helpers import scan

        client_kwargs = self.connector_config.access_config.to_dict(apply_name_overload=False)
        client = OpenSearch(**client_kwargs)

        hits = scan(
            client,
            query={
                "_source": self.connector_config.fields,
                "version": True,
                "query": {"ids": {"values": self.list_of_ids}},
            },
            scroll="1m",
            index=self.connector_config.index_name,
        )
        return list(hits)

    @SourceConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def get_files(self):
        """Fetch the batch and write one text file per document.

        Each document's ``_source`` is flattened and its values are written
        newline-separated to the ingest doc's ``filename``; the ingest doc is
        then appended to ``self.ingest_docs``.
        """
        for hit in self._get_docs():
            meta = ElasticsearchDocumentMeta(self.connector_config.index_name, hit["_id"])
            single_doc = OpenSearchIngestDoc(
                processor_config=self.processor_config,
                read_config=self.read_config,
                connector_config=self.connector_config,
                document=hit,
                document_meta=meta,
            )
            single_doc.update_source_metadata()

            source_body = hit["_source"]
            target_path = single_doc.filename
            flat_values = flatten_dict(dictionary=source_body).values()
            file_text = "\n".join([str(value) for value in flat_values])

            target_path.parent.mkdir(parents=True, exist_ok=True)
            with open(target_path, "w", encoding="utf8") as out_file:
                out_file.write(file_text)
            self.ingest_docs.append(single_doc)
|
||||
|
||||
|
||||
@dataclass
class OpenSearchSourceConnector(ElasticsearchSourceConnector):
    """Fetches particular fields from all documents in a given opensearch cluster and index"""

    connector_config: SimpleOpenSearchConfig
    _ops: t.Optional["OpenSearch"] = field(init=False, default=None)

    @property
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def ops(self):
        """Return the OpenSearch client, creating it on first access."""
        if self._ops is None:
            from opensearchpy import OpenSearch

            self._ops = OpenSearch(
                **self.connector_config.access_config.to_dict(apply_name_overload=False)
            )
        return self._ops

    def check_connection(self):
        """Ping the cluster.

        Raises:
            SourceConnectionError: If creating the client or the ping raises;
                the original exception is chained as the cause.
        """
        try:
            self.ops.ping()
        except Exception as e:
            logger.error(f"failed to validate connection: {e}", exc_info=True)
            raise SourceConnectionError(f"failed to validate connection: {e}") from e

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def _get_doc_ids(self):
        """Fetches all document ids in an index"""
        from opensearchpy.helpers import scan

        hits = scan(
            self.ops,
            query=self.scan_query,
            scroll="1m",
            index=self.connector_config.index_name,
        )

        return [hit["_id"] for hit in hits]

    def get_ingest_docs(self):
        """Fetches all documents in an index, using ids that are fetched with _get_doc_ids

        Returns:
            One ``OpenSearchIngestDocBatch`` per consecutive slice of at most
            ``batch_size`` ids.

        Raises:
            ValueError: If the configured ``batch_size`` is smaller than 1.
        """
        batch_size = self.connector_config.batch_size
        # Fail early with a clear message instead of a ZeroDivisionError (0)
        # or a silently empty result (negative values).
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        ids = self._get_doc_ids()
        id_batches = [
            ids[start : start + batch_size]  # noqa
            for start in range(0, len(ids), batch_size)
        ]
        return [
            OpenSearchIngestDocBatch(
                connector_config=self.connector_config,
                processor_config=self.processor_config,
                read_config=self.read_config,
                list_of_ids=batched_ids,
            )
            for batched_ids in id_batches
        ]
|
||||
|
||||
|
||||
@dataclass
class OpenSearchDestinationConnector(ElasticsearchDestinationConnector):
    """Writes element dictionaries to an OpenSearch index using bulk requests."""

    connector_config: SimpleOpenSearchConfig
    _client: t.Optional["OpenSearch"] = field(init=False, default=None)

    @DestinationConnectionError.wrap
    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def generate_client(self) -> "OpenSearch":
        """Create an OpenSearch client from the connector's access config."""
        from opensearchpy import OpenSearch

        return OpenSearch(**self.connector_config.access_config.to_dict(apply_name_overload=False))

    @requires_dependencies(["opensearchpy"], extras="opensearch")
    def write_dict(self, *args, elements_dict: t.List[t.Dict[str, t.Any]]) -> None:
        """Upload the given elements in byte-limited batches.

        Args:
            *args: Accepted and ignored.
            elements_dict: The element dictionaries to index.
        """
        from opensearchpy.helpers import parallel_bulk

        logger.info(
            f"writing document batches to destination"
            f" index named {self.connector_config.index_name}"
            f" at {self.connector_config.access_config.hosts}"
            f" with batch size (in bytes) {self.write_config.batch_size_bytes}"
            f" with {self.write_config.num_processes} (number of) processes"
        )

        for batch in generator_batching_wbytes(
            elements_dict, batch_size_limit_bytes=self.write_config.batch_size_bytes
        ):
            for success, info in parallel_bulk(
                self.client, batch, thread_count=self.write_config.num_processes
            ):
                if not success:
                    # The %s placeholder is required: passing `info` as a bare
                    # positional argument makes logging fail to format the record,
                    # so the failure details would never be logged.
                    logger.error(
                        "upload failed for a batch in opensearch destination connector: %s",
                        info,
                    )
|
@ -28,6 +28,7 @@ from unstructured.ingest.connector.notion.connector import (
|
||||
NotionPageIngestDoc,
|
||||
)
|
||||
from unstructured.ingest.connector.onedrive import OneDriveIngestDoc
|
||||
from unstructured.ingest.connector.opensearch import OpenSearchIngestDoc, OpenSearchIngestDocBatch
|
||||
from unstructured.ingest.connector.outlook import OutlookIngestDoc
|
||||
from unstructured.ingest.connector.reddit import RedditIngestDoc
|
||||
from unstructured.ingest.connector.salesforce import SalesforceIngestDoc
|
||||
@ -64,6 +65,8 @@ INGEST_DOC_NAME_TO_CLASS: Dict[str, Type[EnhancedDataClassJsonMixin]] = {
|
||||
"notion_database": NotionDatabaseIngestDoc,
|
||||
"notion_page": NotionPageIngestDoc,
|
||||
"onedrive": OneDriveIngestDoc,
|
||||
"opensearch": OpenSearchIngestDoc,
|
||||
"opensearch_batch": OpenSearchIngestDocBatch,
|
||||
"outlook": OutlookIngestDoc,
|
||||
"reddit": RedditIngestDoc,
|
||||
"s3": S3IngestDoc,
|
||||
|
@ -24,6 +24,7 @@ from .local import LocalRunner
|
||||
from .mongodb import MongoDBRunner
|
||||
from .notion import NotionRunner
|
||||
from .onedrive import OneDriveRunner
|
||||
from .opensearch import OpenSearchRunner
|
||||
from .outlook import OutlookRunner
|
||||
from .reddit import RedditRunner
|
||||
from .salesforce import SalesforceRunner
|
||||
@ -53,6 +54,7 @@ runner_map: t.Dict[str, Type[Runner]] = {
|
||||
"mongodb": MongoDBRunner,
|
||||
"notion": NotionRunner,
|
||||
"onedrive": OneDriveRunner,
|
||||
"opensearch": OpenSearchRunner,
|
||||
"outlook": OutlookRunner,
|
||||
"reddit": RedditRunner,
|
||||
"s3": S3Runner,
|
||||
@ -83,6 +85,7 @@ __all__ = [
|
||||
"MongoDBRunner",
|
||||
"NotionRunner",
|
||||
"OneDriveRunner",
|
||||
"OpenSearchRunner",
|
||||
"OutlookRunner",
|
||||
"RedditRunner",
|
||||
"S3Runner",
|
||||
|
40
unstructured/ingest/runner/opensearch.py
Normal file
40
unstructured/ingest/runner/opensearch.py
Normal file
@ -0,0 +1,40 @@
|
||||
import hashlib
|
||||
import typing as t
|
||||
from dataclasses import dataclass
|
||||
|
||||
from unstructured.ingest.interfaces import BaseSourceConnector
|
||||
from unstructured.ingest.logger import logger
|
||||
from unstructured.ingest.runner.base_runner import Runner
|
||||
from unstructured.ingest.runner.utils import update_download_dir_hash
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from unstructured.ingest.connector.opensearch import SimpleOpenSearchConfig
|
||||
|
||||
|
||||
@dataclass
class OpenSearchRunner(Runner):
    """Runner that plugs the OpenSearch source connector into the ingest pipeline."""

    connector_config: "SimpleOpenSearchConfig"

    def update_read_config(self):
        """Point the download dir at a location derived from the hosts and index name."""
        joined_hosts = ",".join(self.connector_config.access_config.hosts)
        index_name = self.connector_config.index_name
        hash_input = f"{joined_hosts}_{index_name}"
        hashed_dir_name = hashlib.sha256(hash_input.encode("utf-8"))

        self.read_config.download_dir = update_download_dir_hash(
            connector_name="opensearch",
            read_config=self.read_config,
            hashed_dir_name=hashed_dir_name,
            logger=logger,
        )

    def get_source_connector_cls(self) -> t.Type[BaseSourceConnector]:
        """Return the OpenSearch source connector class (imported on demand)."""
        from unstructured.ingest.connector.opensearch import OpenSearchSourceConnector

        return OpenSearchSourceConnector
|
@ -11,6 +11,7 @@ from .fsspec.dropbox import DropboxWriter
|
||||
from .fsspec.gcs import GcsWriter
|
||||
from .fsspec.s3 import S3Writer
|
||||
from .mongodb import MongodbWriter
|
||||
from .opensearch import OpenSearchWriter
|
||||
from .pinecone import PineconeWriter
|
||||
from .qdrant import QdrantWriter
|
||||
from .sql import SqlWriter
|
||||
@ -26,6 +27,7 @@ writer_map: t.Dict[str, t.Type[Writer]] = {
|
||||
"elasticsearch": ElasticsearchWriter,
|
||||
"gcs": GcsWriter,
|
||||
"mongodb": MongodbWriter,
|
||||
"opensearch": OpenSearchWriter,
|
||||
"pinecone": PineconeWriter,
|
||||
"qdrant": QdrantWriter,
|
||||
"s3": S3Writer,
|
||||
|
26
unstructured/ingest/runner/writers/opensearch.py
Normal file
26
unstructured/ingest/runner/writers/opensearch.py
Normal file
@ -0,0 +1,26 @@
|
||||
import typing as t
|
||||
from dataclasses import dataclass
|
||||
|
||||
from unstructured.ingest.interfaces import BaseDestinationConnector
|
||||
from unstructured.ingest.runner.writers.base_writer import Writer
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
from unstructured.ingest.connector.elasticsearch import (
|
||||
ElasticsearchWriteConfig,
|
||||
)
|
||||
from unstructured.ingest.connector.opensearch import (
|
||||
SimpleOpenSearchConfig,
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class OpenSearchWriter(Writer):
    """Writer that selects the OpenSearch destination connector.

    Write settings reuse the Elasticsearch write config type.
    """

    connector_config: "SimpleOpenSearchConfig"
    write_config: "ElasticsearchWriteConfig"

    def get_connector_cls(self) -> t.Type[BaseDestinationConnector]:
        """Return the destination connector class (the class itself, not an instance)."""
        from unstructured.ingest.connector.opensearch import (
            OpenSearchDestinationConnector,
        )

        return OpenSearchDestinationConnector
|
Loading…
x
Reference in New Issue
Block a user