mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-31 04:05:24 +00:00
fix: local connector with input path to single file (#2116)
When passed an absolute file path for the input document path, the local connector incorrectly writes the output file to the wrong directory. Also, when the input path is a single file, we currently include the parent path as part of the destination path; instead, when a single file is specified as input, the output file should be located directly in the specified output directory. Note: this change meant that we needed to bump the file path of some expected results. With this fix, the output in this case is written to `output-dir/input-filename.json`. ## Changes - Fix for incorrect output path of files partitioned via the local connector when the input path is a file path (rather than a directory) - Updated single-local-file test to validate the flow where we specify an absolute file path (since this was particularly broken) ## Testing Note: running the updated `local-single-file` test without the changes to the local connector will result in a final output copy of: ``` Copying /Users/ryannikolaidis/Development/unstructured/unstructured/test_unstructured_ingest/workdir/local-single-file/partitioned/a48c2abec07a9a31860429f94e5a6ade.json -> /Users/ryannikolaidis/Development/unstructured/unstructured/test_unstructured_ingest/../example-docs/language-docs/UDHR_first_article_all.txt.json ``` where the output path is the input path (with `.json` appended) and not the expected `output-dir/input-filename.json`. Running with this change, we can now expect the file to be written to the output directory. --------- Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
This commit is contained in:
parent
d623d75d3c
commit
13a23deba6
@ -1,4 +1,4 @@
|
||||
## 0.11.0-dev6
|
||||
## 0.11.0-dev7
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -22,6 +22,7 @@
|
||||
* **Fix some pdfs returning `KeyError: 'N'`** Certain pdfs were throwing this error when being opened by pdfminer. Added a wrapper function for pdfminer that allows these documents to be partitioned.
|
||||
* **Fix mis-splits on `Table` chunks.** Remedies repeated appearance of full `.text_as_html` on metadata of each `TableChunk` split from a `Table` element too large to fit in the chunking window.
|
||||
* **Import tables_agent from inference** so that we don't have to initialize a global table agent in unstructured OCR again
|
||||
* **Fix local connector with absolute input path** When passed an absolute filepath for the input document path, the local connector incorrectly writes the output file to the input file directory. With this fix, the output in this case is written to `output-dir/input-filename.json`.
|
||||
|
||||
## 0.10.30
|
||||
|
||||
|
@ -6,9 +6,9 @@ SRC_PATH=$(dirname "$(realpath "$0")")
|
||||
SCRIPT_DIR=$(dirname "$SRC_PATH")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_FOLDER_NAME=azure-cog-search-dest
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
||||
OUTPUT_FOLDER_NAME=azure-cog-search-dest
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
|
||||
DESTINATION_INDEX="utic-test-ingest-fixtures-output-$(uuidgen)"
|
||||
@ -90,19 +90,19 @@ while [ "$docs_count_remote" -eq 0 ] && [ "$attempt" -lt 6 ]; do
|
||||
--header "api-key: $AZURE_SEARCH_API_KEY" \
|
||||
--header 'content-type: application/json' | jq)
|
||||
|
||||
echo "docs count pulled from Azure: $docs_count_remote"
|
||||
echo "docs count pulled from Azure Cognitive Search: $docs_count_remote"
|
||||
|
||||
attempt=$((attempt+1))
|
||||
done
|
||||
|
||||
|
||||
docs_count_local=0
|
||||
for i in $(jq length "$OUTPUT_DIR"/**/*.json); do
|
||||
for i in $(jq length "$OUTPUT_DIR"/*.json); do
|
||||
docs_count_local=$((docs_count_local+i));
|
||||
done
|
||||
|
||||
|
||||
if [ "$docs_count_remote" -ne "$docs_count_local" ];then
|
||||
echo "Number of docs in Azure $docs_count_remote doesn't match the expected docs: $docs_count_local"
|
||||
echo "Number of docs in Azure Cognitive Search $docs_count_remote doesn't match the expected docs: $docs_count_local"
|
||||
exit 1
|
||||
fi
|
||||
|
@ -73,7 +73,7 @@ expected_num_files=1
|
||||
num_files_in_dropbox=$(curl -X POST https://api.dropboxapi.com/2/files/list_folder \
|
||||
--header "Content-Type: application/json" \
|
||||
--header "Authorization: Bearer $DROPBOX_ACCESS_TOKEN" \
|
||||
--data "{\"path\":\"$DESTINATION_DROPBOX/example-docs/\"}" | jq '.entries | length')
|
||||
--data "{\"path\":\"$DESTINATION_DROPBOX/\"}" | jq '.entries | length')
|
||||
if [ "$num_files_in_dropbox" -ne "$expected_num_files" ]; then
|
||||
echo "Expected $expected_num_files files to be uploaded to dropbox, but found $num_files_in_dropbox files."
|
||||
exit 1
|
||||
|
@ -66,4 +66,4 @@ python "$SCRIPT_DIR"/python/test-ingest-mongodb.py \
|
||||
--database "$MONGODB_DATABASE_NAME" \
|
||||
--collection "$DESTINATION_MONGO_COLLECTION" \
|
||||
check-vector \
|
||||
--output-json "$OUTPUT_ROOT"/structured-output/$OUTPUT_FOLDER_NAME/example-docs/fake-memo.pdf.json
|
||||
--output-json "$OUTPUT_ROOT"/structured-output/$OUTPUT_FOLDER_NAME/fake-memo.pdf.json
|
||||
|
@ -43,7 +43,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
|
||||
# Simply check the number of files uploaded
|
||||
expected_num_files=1
|
||||
num_files_in_s3=$(aws s3 ls "${DESTINATION_S3}example-docs/" --region us-east-2 | grep -c "\.json$")
|
||||
num_files_in_s3=$(aws s3 ls "${DESTINATION_S3}" --region us-east-2 | grep -c "\.json$")
|
||||
if [ "$num_files_in_s3" -ne "$expected_num_files" ]; then
|
||||
echo "Expected $expected_num_files files to be uploaded to s3, but found $num_files_in_s3 files."
|
||||
exit 1
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -9,6 +9,8 @@ OUTPUT_FOLDER_NAME=local-single-file
|
||||
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
|
||||
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
|
||||
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
|
||||
# assigning an absolute path to the input file so that we explicitly test passing an absolute path
|
||||
ABS_INPUT_PATH="$SCRIPT_DIR/../example-docs/language-docs/UDHR_first_article_all.txt"
|
||||
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
|
||||
|
||||
# shellcheck disable=SC1091
|
||||
@ -29,7 +31,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
|
||||
--additional-partition-args '{"strategy":"ocr_only", "languages":["ind", "est"]}' \
|
||||
--verbose \
|
||||
--reprocess \
|
||||
--input-path example-docs/language-docs/UDHR_first_article_all.txt \
|
||||
--input-path "$ABS_INPUT_PATH" \
|
||||
--work-dir "$WORK_DIR"
|
||||
|
||||
set +e
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.11.0-dev6" # pragma: no cover
|
||||
__version__ = "0.11.0-dev7" # pragma: no cover
|
||||
|
@ -41,10 +41,12 @@ class LocalIngestDoc(BaseIngestDoc):
|
||||
|
||||
@property
|
||||
def base_filename(self) -> t.Optional[str]:
|
||||
download_path = str(Path(self.connector_config.input_path).resolve())
|
||||
full_path = str(self.filename)
|
||||
base_path = full_path.replace(download_path, "")
|
||||
return base_path
|
||||
download_path = Path(self.connector_config.input_path).resolve()
|
||||
full_path = Path(self.filename).resolve()
|
||||
if download_path.is_file():
|
||||
download_path = download_path.parent
|
||||
relative_path = full_path.relative_to(download_path)
|
||||
return str(relative_path)
|
||||
|
||||
@property
|
||||
def filename(self):
|
||||
|
Loading…
x
Reference in New Issue
Block a user