mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-05 20:37:36 +00:00
fix: remove default encoding for ingest (#1036)
This commit is contained in:
parent
25ca5744cf
commit
cd1df5e8e6
@ -1,4 +1,4 @@
|
|||||||
## 0.9.1-dev8
|
## 0.9.1-dev9
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -20,6 +20,7 @@
|
|||||||
* Pass `file_filename` metadata when partitioning file object
|
* Pass `file_filename` metadata when partitioning file object
|
||||||
* Skip ingest test on missing Slack token
|
* Skip ingest test on missing Slack token
|
||||||
* Add Dropbox variables to CI environments
|
* Add Dropbox variables to CI environments
|
||||||
|
* Remove default encoding for ingest
|
||||||
* Adds new element type `EmailAddress` for recognising email address in the text
|
* Adds new element type `EmailAddress` for recognising email address in the text
|
||||||
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
|
* Simplifies `min_partition` logic; makes partitions falling below the `min_partition`
|
||||||
less likely.
|
less likely.
|
||||||
|
|||||||
@ -12,9 +12,10 @@ EXPECTED_NUM_FILES=$1
|
|||||||
OUTPUT_FOLDER_NAME=$2
|
OUTPUT_FOLDER_NAME=$2
|
||||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
|
num_files_created="$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)"
|
||||||
|
|
||||||
if [[ "$(find "$OUTPUT_DIR" -type f -exec printf '.' \; | wc -c | xargs)" != "$EXPECTED_NUM_FILES" ]]; then
|
if [[ "$num_files_created" != "$EXPECTED_NUM_FILES" ]]; then
|
||||||
echo
|
echo
|
||||||
echo "$EXPECTED_NUM_FILES files should have been created."
|
echo "ERROR: $num_files_created files created. $EXPECTED_NUM_FILES files should have been created."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -495,7 +495,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "NarrativeText",
|
"type": "NarrativeText",
|
||||||
"element_id": "f3be9748ecd68b20d706548129baa22d",
|
"element_id": "7480a79a5bad8a36f3f7e5d622f0b5f3",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"data_source": {},
|
"data_source": {},
|
||||||
"filetype": "text/html",
|
"filetype": "text/html",
|
||||||
@ -507,7 +507,7 @@
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
|
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\r\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "NarrativeText",
|
"type": "NarrativeText",
|
||||||
|
|||||||
@ -17,4 +17,4 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
--file-glob "*.html" \
|
--file-glob "*.html" \
|
||||||
--input-path example-docs
|
--input-path example-docs
|
||||||
|
|
||||||
sh "$SCRIPT_DIR"/check-num-files-output.sh 9 $OUTPUT_FOLDER_NAME
|
sh "$SCRIPT_DIR"/check-num-files-output.sh 11 $OUTPUT_FOLDER_NAME
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.9.1-dev8" # pragma: no cover
|
__version__ = "0.9.1-dev9" # pragma: no cover
|
||||||
|
|||||||
@ -201,8 +201,9 @@ def add_shared_options(cmd: Command):
|
|||||||
),
|
),
|
||||||
Option(
|
Option(
|
||||||
["--encoding"],
|
["--encoding"],
|
||||||
default="utf-8",
|
default=None,
|
||||||
help="Text encoding to use when reading documents. Default: utf-8",
|
help="Text encoding to use when reading documents. By default the encoding is "
|
||||||
|
"detected automatically.",
|
||||||
),
|
),
|
||||||
Option(
|
Option(
|
||||||
["--api-key"],
|
["--api-key"],
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user