mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix: adjust threshold for encoding detection (#894)
* chore: add example doc * fix: adjust encoding recognition threshold value in `detect_file_encoding` * test: add test cases for German characters * chore: update changelog & version
This commit is contained in:
parent
52aced8677
commit
47bc4009a8
@ -1,4 +1,4 @@
|
||||
## 0.8.0-dev1
|
||||
## 0.8.0-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
* Add metadata_filename parameter across all partition functions
|
||||
|
||||
### Fixes
|
||||
* Adjust encoding recognition threshold value in `detect_file_encoding`
|
||||
* Fix KeyError when `isd_to_elements` doesn't find a type
|
||||
* Fix _output_filename for local connector, allowing single files to be written correctly to the disk
|
||||
|
||||
|
1
example-docs/fake-html-lang-de.html
Normal file
1
example-docs/fake-html-lang-de.html
Normal file
@ -0,0 +1 @@
|
||||
<h3 class="l_titel">Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020</h3>
|
@ -12,6 +12,10 @@ from unstructured.partition.html import partition_html
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
EXPECTED_OUTPUT_LANGUAGE_DE = [
|
||||
Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"),
|
||||
]
|
||||
|
||||
|
||||
def test_partition_html_from_filename():
|
||||
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
@ -47,7 +51,7 @@ def test_partition_html_from_filename_raises_encoding_error(filename, encoding,
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
|
||||
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
|
||||
)
|
||||
def test_partition_html_from_filename_default_encoding(filename):
|
||||
filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
||||
@ -55,6 +59,8 @@ def test_partition_html_from_filename_default_encoding(filename):
|
||||
assert len(elements) > 0
|
||||
for element in elements:
|
||||
assert element.metadata.filename == filename
|
||||
if filename == "fake-html-lang-de.html":
|
||||
assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
||||
|
||||
|
||||
def test_partition_html_from_filename_metadata_false():
|
||||
@ -108,13 +114,15 @@ def test_partition_html_from_file_raises_encoding_error(filename, encoding, erro
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
|
||||
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
|
||||
)
|
||||
def test_partition_html_from_file_default_encoding(filename):
    """Partition an example document from a text-mode file handle with no explicit encoding.

    `filename` is the bare file name supplied by the parametrization; the
    document is read from the `example-docs` directory two levels above
    `DIRECTORY`. Every document must yield at least one element, and the
    German example must additionally match `EXPECTED_OUTPUT_LANGUAGE_DE`.
    """
    # Keep the parametrized bare name intact. Rebinding `filename` to the joined
    # path made the comparison against "fake-html-lang-de.html" below always
    # false, so the German-text assertion was never executed.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path) as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -133,13 +141,15 @@ def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, e
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"filename",
|
||||
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
|
||||
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
|
||||
)
|
||||
def test_partition_html_from_file_rb_default_encoding(filename):
    """Partition an example document from a binary-mode file handle with no explicit encoding.

    `filename` is the bare file name supplied by the parametrization; the
    document is read from the `example-docs` directory two levels above
    `DIRECTORY`. Every document must yield at least one element, and the
    German example must additionally match `EXPECTED_OUTPUT_LANGUAGE_DE`.
    """
    # Keep the parametrized bare name intact. Rebinding `filename` to the joined
    # path made the comparison against "fake-html-lang-de.html" below always
    # false, so the German-text assertion was never executed.
    filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
    with open(filename_path, "rb") as f:
        elements = partition_html(file=f)
    assert len(elements) > 0
    if filename == "fake-html-lang-de.html":
        assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
||||
|
||||
|
||||
def test_partition_html_from_text():
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.8.0-dev1" # pragma: no cover
|
||||
__version__ = "0.8.0-dev2" # pragma: no cover
|
||||
|
@ -4,7 +4,7 @@ import chardet
|
||||
|
||||
from unstructured.partition.common import convert_to_bytes
|
||||
|
||||
ENCODE_REC_THRESHOLD = 0.5
|
||||
ENCODE_REC_THRESHOLD = 0.8
|
||||
|
||||
# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
|
||||
COMMON_ENCODINGS = [
|
||||
|
Loading…
x
Reference in New Issue
Block a user