fix: remove default encoding for ingest (#1036)

This commit is contained in:
ryannikolaidis 2023-08-05 09:57:45 -07:00 committed by GitHub
parent 25ca5744cf
commit cd1df5e8e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 12 additions and 9 deletions

View File

@ -1,4 +1,4 @@
## 0.9.1-dev8 ## 0.9.1-dev9
### Enhancements ### Enhancements
@ -20,6 +20,7 @@
* Pass `file_filename` metadata when partitioning file object * Pass `file_filename` metadata when partitioning file object
* Skip ingest test on missing Slack token * Skip ingest test on missing Slack token
* Add Dropbox variables to CI environments * Add Dropbox variables to CI environments
* Remove default encoding for ingest
* Adds new element type `EmailAddress` for recognising email address in the text * Adds new element type `EmailAddress` for recognising email address in the text
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition` * Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
less likely. less likely.

View File

@ -12,9 +12,10 @@ EXPECTED_NUM_FILES=$1
OUTPUT_FOLDER_NAME=$2 OUTPUT_FOLDER_NAME=$2
SCRIPT_DIR=$(dirname "$(realpath "$0")") SCRIPT_DIR=$(dirname "$(realpath "$0")")
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
num_files_created="$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)"
if [[ "$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)" != "$EXPECTED_NUM_FILES" ]]; then if [[ num_files_created != "$EXPECTED_NUM_FILES" ]]; then
echo echo
echo "$EXPECTED_NUM_FILES files should have been created." echo "ERROR: $num_files_created files created. $EXPECTED_NUM_FILES files should have been created."
exit 1 exit 1
fi fi

View File

@ -495,7 +495,7 @@
}, },
{ {
"type": "NarrativeText", "type": "NarrativeText",
"element_id": "f3be9748ecd68b20d706548129baa22d", "element_id": "7480a79a5bad8a36f3f7e5d622f0b5f3",
"metadata": { "metadata": {
"data_source": {}, "data_source": {},
"filetype": "text/html", "filetype": "text/html",
@ -507,7 +507,7 @@
} }
] ]
}, },
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”" "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
}, },
{ {
"type": "NarrativeText", "type": "NarrativeText",

View File

@ -17,4 +17,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--file-glob "*.html" \ --file-glob "*.html" \
--input-path example-docs --input-path example-docs
sh "$SCRIPT_DIR"/check-num-files-output.sh 9 $OUTPUT_FOLDER_NAME sh "$SCRIPT_DIR"/check-num-files-output.sh 11 $OUTPUT_FOLDER_NAME

View File

@ -1 +1 @@
__version__ = "0.9.1-dev8" # pragma: no cover __version__ = "0.9.1-dev9" # pragma: no cover

View File

@ -201,8 +201,9 @@ def add_shared_options(cmd: Command):
), ),
Option( Option(
["--encoding"], ["--encoding"],
default="utf-8", default=None,
help="Text encoding to use when reading documents. Default: utf-8", help="Text encoding to use when reading documents. By default the encoding is "
"detected automatically.",
), ),
Option( Option(
["--api-key"], ["--api-key"],