From cd1df5e8e6aff685adbd550459d0b3c8a72eed7c Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Sat, 5 Aug 2023 09:57:45 -0700 Subject: [PATCH] fix: remove default encoding for ingest (#1036) --- CHANGELOG.md | 3 ++- test_unstructured_ingest/check-num-files-output.sh | 5 +++-- .../azure/spring-weather.html.json | 4 ++-- test_unstructured_ingest/test-ingest-local.sh | 2 +- unstructured/__version__.py | 2 +- unstructured/ingest/cli/common.py | 5 +++-- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e91fc4aa..b2afee86a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.9.1-dev8 +## 0.9.1-dev9 ### Enhancements @@ -20,6 +20,7 @@ * Pass `file_filename` metadata when partitioning file object * Skip ingest test on missing Slack token * Add Dropbox variables to CI environments +* Remove default encoding for ingest * Adds new element type `EmailAddress` for recognising email address in the  text * Simplifies `min_partition` logic; makes partitions falling below the `min_partition` less likely. diff --git a/test_unstructured_ingest/check-num-files-output.sh b/test_unstructured_ingest/check-num-files-output.sh index 871d1e8fa..64b407151 100644 --- a/test_unstructured_ingest/check-num-files-output.sh +++ b/test_unstructured_ingest/check-num-files-output.sh @@ -12,9 +12,10 @@ EXPECTED_NUM_FILES=$1 OUTPUT_FOLDER_NAME=$2 SCRIPT_DIR=$(dirname "$(realpath "$0")") OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME +num_files_created="$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)" -if [[ "$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)" != "$EXPECTED_NUM_FILES" ]]; then +if [[ "$num_files_created" != "$EXPECTED_NUM_FILES" ]]; then echo - echo "$EXPECTED_NUM_FILES files should have been created." + echo "ERROR: $num_files_created files created. $EXPECTED_NUM_FILES files should have been created." 
exit 1 fi diff --git a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json index 502cef33e..b2d94e538 100644 --- a/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json +++ b/test_unstructured_ingest/expected-structured-output/azure/spring-weather.html.json @@ -495,7 +495,7 @@ }, { "type": "NarrativeText", - "element_id": "f3be9748ecd68b20d706548129baa22d", + "element_id": "7480a79a5bad8a36f3f7e5d622f0b5f3", "metadata": { "data_source": {}, "filetype": "text/html", @@ -507,7 +507,7 @@ } ] }, - "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”" + "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”" }, { "type": "NarrativeText", diff --git a/test_unstructured_ingest/test-ingest-local.sh b/test_unstructured_ingest/test-ingest-local.sh index 286b4e05b..732d0135d 100755 --- a/test_unstructured_ingest/test-ingest-local.sh +++ b/test_unstructured_ingest/test-ingest-local.sh @@ -17,4 +17,4 @@ PYTHONPATH=. 
./unstructured/ingest/main.py \ --file-glob "*.html" \ --input-path example-docs -sh "$SCRIPT_DIR"/check-num-files-output.sh 9 $OUTPUT_FOLDER_NAME +sh "$SCRIPT_DIR"/check-num-files-output.sh 11 $OUTPUT_FOLDER_NAME diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 188e554f9..89d4dca26 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.9.1-dev8" # pragma: no cover +__version__ = "0.9.1-dev9" # pragma: no cover diff --git a/unstructured/ingest/cli/common.py b/unstructured/ingest/cli/common.py index 77c1a0a71..447c5e611 100644 --- a/unstructured/ingest/cli/common.py +++ b/unstructured/ingest/cli/common.py @@ -201,8 +201,9 @@ def add_shared_options(cmd: Command): ), Option( ["--encoding"], - default="utf-8", - help="Text encoding to use when reading documents. Default: utf-8", + default=None, + help="Text encoding to use when reading documents. By default the encoding is " + "detected automatically.", ), Option( ["--api-key"],