mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix: unstructured-ingest embedding KeyError (#1727)
Currently adding the embedding flag to any unstructured-ingest call results in this failure: ``` 2023-10-11 22:42:14,177 MainProcess ERROR 'b8a98c5d963a9dd75847a8f110cbf7c9' multiprocessing.pool.RemoteTraceback: """ Traceback (most recent call last): File "/Users/ryannikolaidis/.pyenv/versions/3.10.11/lib/python3.10/multiprocessing/pool.py", line 125, in worker result = (True, func(*args, **kwds)) File "/Users/ryannikolaidis/.pyenv/versions/3.10.11/lib/python3.10/multiprocessing/pool.py", line 48, in mapstar return list(map(*args)) File "/Users/ryannikolaidis/Development/unstructured/unstructured/unstructured/ingest/pipeline/copy.py", line 14, in run ingest_doc_json = self.pipeline_context.ingest_docs_map[doc_hash] File "<string>", line 2, in __getitem__ File "/Users/ryannikolaidis/.pyenv/versions/3.10.11/lib/python3.10/multiprocessing/managers.py", line 833, in _callmethod raise convert_to_error(kind, result) KeyError: 'b8a98c5d963a9dd75847a8f110cbf7c9' """ ``` This is because the run method for the embedding node is not adding the IngestDoc to the context map. This PR adds that logic and adds a test to validate that the embeddings option works as expected. NOTE: until https://github.com/Unstructured-IO/unstructured/pull/1719 goes in, the expected results include the duplicate element bug, however currently this does at least prove that embeddings are generated and the function doesn't error.
This commit is contained in:
parent
d726963e42
commit
d22044a44c
2
.github/workflows/ci.yml
vendored
2
.github/workflows/ci.yml
vendored
@ -295,6 +295,7 @@ jobs:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
|
||||
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
TABLE_OCR: "tesseract"
|
||||
ENTIRE_PAGE_OCR: "tesseract"
|
||||
CI: "true"
|
||||
@ -327,6 +328,7 @@ jobs:
|
||||
make install-ingest-wikipedia
|
||||
make install-ingest-notion
|
||||
make install-ingest-delta-table
|
||||
pip install "unstructured[openai]"
|
||||
./test_unstructured_ingest/test-ingest.sh
|
||||
|
||||
test_unstructured_api_unit:
|
||||
|
@ -92,6 +92,7 @@ jobs:
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
|
||||
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
TABLE_OCR: "tesseract"
|
||||
ENTIRE_PAGE_OCR: "tesseract"
|
||||
OVERWRITE_FIXTURES: "true"
|
||||
@ -124,6 +125,7 @@ jobs:
|
||||
make install-ingest-wikipedia
|
||||
make install-ingest-notion
|
||||
make install-ingest-delta-table
|
||||
pip install "unstructured[openai]"
|
||||
./test_unstructured_ingest/test-ingest.sh
|
||||
|
||||
- name: Save branch name to environment file
|
||||
|
@ -1,4 +1,4 @@
|
||||
## 0.10.22-dev4
|
||||
## 0.10.22-dev5
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
### Fixes
|
||||
|
||||
* **Fixes PDF list parsing creating duplicate list items** Previously a bug in PDF list item parsing caused removal of other elements and duplication of the list items
|
||||
* **Fixes failure when flagging for embeddings through unstructured-ingest** Currently adding the embedding parameter to any connector results in a failure on the copy stage. This resolves the issue by adding the IngestDoc to the context map in the embedding node's `run` method. This allows users to specify that connectors fetch embeddings without failure.
|
||||
* **Fix ingest pipeline reformat nodes not discoverable** Fixes issue where reformat nodes raise ModuleNotFoundError on import. This was because the directory was missing the `__init__.py` needed to make it discoverable.
|
||||
* **Fix default language in ingest CLI** Previously the default was being set to English, which injected potentially incorrect information into downstream language detection libraries. Setting the default to None allows those libraries to better detect what language the text is in the doc being processed.
|
||||
|
||||
|
36
test_unstructured_ingest/test-ingest-embed.sh
Executable file
36
test_unstructured_ingest/test-ingest-embed.sh
Executable file
@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=embed
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
source "$SCRIPT_DIR"/cleanup.sh
|
||||
function cleanup() {
|
||||
cleanup_dir "$OUTPUT_DIR"
|
||||
cleanup_dir "$WORK_DIR"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--num-processes "$max_processes" \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
|
||||
--output-dir "$OUTPUT_DIR" \
|
||||
--verbose \
|
||||
--reprocess \
|
||||
--input-path example-docs/fake-text-utf-16.txt \
|
||||
--work-dir "$WORK_DIR" \
|
||||
--embedding-api-key "$OPENAI_API_KEY"
|
||||
|
||||
set +e
|
||||
|
||||
# currently openai encoder is non-deterministic
|
||||
# once we have an alternative encoder that is deterministic, we test the diff here
|
||||
# until then just validating the file was created
|
||||
"$SCRIPT_DIR"/check-num-files-output.sh 1 "$OUTPUT_FOLDER_NAME"
|
@ -43,6 +43,7 @@ all_tests=(
|
||||
'test-ingest-delta-table.sh'
|
||||
'test-ingest-jira.sh'
|
||||
'test-ingest-sharepoint.sh'
|
||||
'test-ingest-embed.sh'
|
||||
)
|
||||
|
||||
full_python_matrix_tests=(
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.10.22-dev4" # pragma: no cover
|
||||
__version__ = "0.10.22-dev5" # pragma: no cover
|
||||
|
@ -35,6 +35,9 @@ class Embedder(ReformatNode):
|
||||
]
|
||||
json_filename = f"{hashed_filename}.json"
|
||||
json_path = (Path(self.get_path()) / json_filename).resolve()
|
||||
self.pipeline_context.ingest_docs_map[
|
||||
hashed_filename
|
||||
] = self.pipeline_context.ingest_docs_map[filename]
|
||||
if not self.pipeline_context.reprocess and json_path.is_file() and json_path.stat().st_size:
|
||||
logger.debug(f"File exists: {json_path}, skipping embedding")
|
||||
return str(json_path)
|
||||
|
Loading…
x
Reference in New Issue
Block a user