fix: adjust threshold for encoding detection (#894)

* chore: add example doc

* fix: adjust encoding recognition threshold value in `detect_file_encoding`

* test: add test cases for German characters

* chore: update changelog & version
This commit is contained in:
Christine Straub 2023-07-07 06:25:03 -07:00 committed by GitHub
parent 52aced8677
commit 47bc4009a8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 18 additions and 6 deletions

View File

@ -1,4 +1,4 @@
## 0.8.0-dev1
## 0.8.0-dev2
### Enhancements
@ -7,6 +7,7 @@
* Add metadata_filename parameter across all partition functions
### Fixes
* Adjust encoding recognition threshold value in `detect_file_encoding`
* Fix KeyError when `isd_to_elements` doesn't find a type
* Fix _output_filename for local connector, allowing single files to be written correctly to the disk

View File

@ -0,0 +1 @@
<h3 class="l_titel">Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020</h3>

View File

@ -12,6 +12,10 @@ from unstructured.partition.html import partition_html
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# Elements expected from partitioning the German example document; the
# default-encoding tests compare their output against this list when the
# parametrized filename is "fake-html-lang-de.html".
EXPECTED_OUTPUT_LANGUAGE_DE = [
Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"),
]
def test_partition_html_from_filename():
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
@ -47,7 +51,7 @@ def test_partition_html_from_filename_raises_encoding_error(filename, encoding,
@pytest.mark.parametrize(
"filename",
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
)
def test_partition_html_from_filename_default_encoding(filename):
filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
@ -55,6 +59,8 @@ def test_partition_html_from_filename_default_encoding(filename):
assert len(elements) > 0
for element in elements:
assert element.metadata.filename == filename
if filename == "fake-html-lang-de.html":
assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
def test_partition_html_from_filename_metadata_false():
@ -108,13 +114,15 @@ def test_partition_html_from_file_raises_encoding_error(filename, encoding, erro
@pytest.mark.parametrize(
"filename",
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
)
def test_partition_html_from_file_default_encoding(filename):
    """Partition each example document from a text-mode file object.

    No encoding is passed to ``partition_html``. For the German fixture the
    result must also equal ``EXPECTED_OUTPUT_LANGUAGE_DE``.
    """
    # Keep the parametrized bare name in `filename`; the fixture check below
    # compares against it, so the joined path goes in a separate variable.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path) as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
@pytest.mark.parametrize(
@ -133,13 +141,15 @@ def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, e
@pytest.mark.parametrize(
"filename",
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
)
def test_partition_html_from_file_rb_default_encoding(filename):
    """Partition each example document from a binary-mode file object.

    No encoding is passed to ``partition_html``. For the German fixture the
    result must also equal ``EXPECTED_OUTPUT_LANGUAGE_DE``.
    """
    # Keep the parametrized bare name in `filename`; the fixture check below
    # compares against it, so the joined path goes in a separate variable.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path, "rb") as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
def test_partition_html_from_text():

View File

@ -1 +1 @@
__version__ = "0.8.0-dev1" # pragma: no cover
__version__ = "0.8.0-dev2" # pragma: no cover

View File

@ -4,7 +4,7 @@ import chardet
from unstructured.partition.common import convert_to_bytes
ENCODE_REC_THRESHOLD = 0.5
ENCODE_REC_THRESHOLD = 0.8
# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
COMMON_ENCODINGS = [