mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 02:53:31 +00:00
fix: remove default encoding for ingest (#1036)
This commit is contained in:
parent
25ca5744cf
commit
cd1df5e8e6
@ -1,4 +1,4 @@
|
||||
## 0.9.1-dev8
|
||||
## 0.9.1-dev9
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -20,6 +20,7 @@
|
||||
* Pass `file_filename` metadata when partitioning file object
|
||||
* Skip ingest test on missing Slack token
|
||||
* Add Dropbox variables to CI environments
|
||||
* Remove default encoding for ingest
|
||||
* Adds new element type `EmailAddress` for recognising email addresses in the text
|
||||
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
|
||||
less likely.
|
||||
|
||||
@ -12,9 +12,10 @@ EXPECTED_NUM_FILES=$1
|
||||
OUTPUT_FOLDER_NAME=$2
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
num_files_created="$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)"
|
||||
|
||||
if [[ "$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)" != "$EXPECTED_NUM_FILES" ]]; then
|
||||
if [[ num_files_created != "$EXPECTED_NUM_FILES" ]]; then
|
||||
echo
|
||||
echo "$EXPECTED_NUM_FILES files should have been created."
|
||||
echo "ERROR: $num_files_created files created. $EXPECTED_NUM_FILES files should have been created."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@ -495,7 +495,7 @@
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "f3be9748ecd68b20d706548129baa22d",
|
||||
"element_id": "7480a79a5bad8a36f3f7e5d622f0b5f3",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "text/html",
|
||||
@ -507,7 +507,7 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
|
||||
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
|
||||
@ -17,4 +17,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--file-glob "*.html" \
|
||||
--input-path example-docs
|
||||
|
||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 9 $OUTPUT_FOLDER_NAME
|
||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 11 $OUTPUT_FOLDER_NAME
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.9.1-dev8" # pragma: no cover
|
||||
__version__ = "0.9.1-dev9" # pragma: no cover
|
||||
|
||||
@ -201,8 +201,9 @@ def add_shared_options(cmd: Command):
|
||||
),
|
||||
Option(
|
||||
["--encoding"],
|
||||
default="utf-8",
|
||||
help="Text encoding to use when reading documents. Default: utf-8",
|
||||
default=None,
|
||||
help="Text encoding to use when reading documents. By default the encoding is "
|
||||
"detected automatically.",
|
||||
),
|
||||
Option(
|
||||
["--api-key"],
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user