fix: remove default encoding for ingest (#1036)

This commit is contained in:
ryannikolaidis 2023-08-05 09:57:45 -07:00 committed by GitHub
parent 25ca5744cf
commit cd1df5e8e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 12 additions and 9 deletions

View File

@ -1,4 +1,4 @@
## 0.9.1-dev8
## 0.9.1-dev9
### Enhancements
@ -20,6 +20,7 @@
* Pass `file_filename` metadata when partitioning file object
* Skip ingest test on missing Slack token
* Add Dropbox variables to CI environments
* Remove default encoding for ingest
* Adds new element type `EmailAddress` for recognising email addresses in the text
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
less likely.

View File

@ -12,9 +12,10 @@ EXPECTED_NUM_FILES=$1
OUTPUT_FOLDER_NAME=$2
SCRIPT_DIR=$(dirname "$(realpath "$0")")
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
num_files_created="$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)"
if [[ "$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)" != "$EXPECTED_NUM_FILES" ]]; then
if [[ "$num_files_created" != "$EXPECTED_NUM_FILES" ]]; then
echo
echo "$EXPECTED_NUM_FILES files should have been created."
echo "ERROR: $num_files_created files created. $EXPECTED_NUM_FILES files should have been created."
exit 1
fi

View File

@ -495,7 +495,7 @@
},
{
"type": "NarrativeText",
"element_id": "f3be9748ecd68b20d706548129baa22d",
"element_id": "7480a79a5bad8a36f3f7e5d622f0b5f3",
"metadata": {
"data_source": {},
"filetype": "text/html",
@ -507,7 +507,7 @@
}
]
},
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
},
{
"type": "NarrativeText",

View File

@ -17,4 +17,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--file-glob "*.html" \
--input-path example-docs
sh "$SCRIPT_DIR"/check-num-files-output.sh 9 $OUTPUT_FOLDER_NAME
sh "$SCRIPT_DIR"/check-num-files-output.sh 11 $OUTPUT_FOLDER_NAME

View File

@ -1 +1 @@
__version__ = "0.9.1-dev8" # pragma: no cover
__version__ = "0.9.1-dev9" # pragma: no cover

View File

@ -201,8 +201,9 @@ def add_shared_options(cmd: Command):
),
Option(
["--encoding"],
default="utf-8",
help="Text encoding to use when reading documents. Default: utf-8",
default=None,
help="Text encoding to use when reading documents. By default the encoding is "
"detected automatically.",
),
Option(
["--api-key"],