mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix: adjust threshold for encoding detection (#894)
* chore: add example doc
* fix: adjust encoding recognition threshold value in `detect_file_encoding`
* test: add test cases for German characters
* chore: update changelog & version
This commit is contained in:
parent
52aced8677
commit
47bc4009a8
@ -1,4 +1,4 @@
|
|||||||
## 0.8.0-dev1
|
## 0.8.0-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -7,6 +7,7 @@
|
|||||||
* Add metadata_filename parameter across all partition functions
|
* Add metadata_filename parameter across all partition functions
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
* Adjust encoding recognition threshold value in `detect_file_encoding`
|
||||||
* Fix KeyError when `isd_to_elements` doesn't find a type
|
* Fix KeyError when `isd_to_elements` doesn't find a type
|
||||||
* Fix _output_filename for local connector, allowing single files to be written correctly to the disk
|
* Fix _output_filename for local connector, allowing single files to be written correctly to the disk
|
||||||
|
|
||||||
|
1
example-docs/fake-html-lang-de.html
Normal file
1
example-docs/fake-html-lang-de.html
Normal file
@ -0,0 +1 @@
|
|||||||
|
<h3 class="l_titel">Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020</h3>
|
@ -12,6 +12,10 @@ from unstructured.partition.html import partition_html
|
|||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
|
|
||||||
|
EXPECTED_OUTPUT_LANGUAGE_DE = [
|
||||||
|
Title(text="Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_partition_html_from_filename():
|
def test_partition_html_from_filename():
|
||||||
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
directory = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||||
@ -47,7 +51,7 @@ def test_partition_html_from_filename_raises_encoding_error(filename, encoding,
|
|||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"filename",
|
"filename",
|
||||||
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
|
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
|
||||||
)
|
)
|
||||||
def test_partition_html_from_filename_default_encoding(filename):
|
def test_partition_html_from_filename_default_encoding(filename):
|
||||||
filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
filename_path = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
||||||
@ -55,6 +59,8 @@ def test_partition_html_from_filename_default_encoding(filename):
|
|||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
for element in elements:
|
for element in elements:
|
||||||
assert element.metadata.filename == filename
|
assert element.metadata.filename == filename
|
||||||
|
if filename == "fake-html-lang-de.html":
|
||||||
|
assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
||||||
|
|
||||||
|
|
||||||
def test_partition_html_from_filename_metadata_false():
|
def test_partition_html_from_filename_metadata_false():
|
||||||
@ -108,13 +114,15 @@ def test_partition_html_from_file_raises_encoding_error(filename, encoding, erro
|
|||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"filename",
|
"filename",
|
||||||
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
|
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
|
||||||
)
|
)
|
||||||
def test_partition_html_from_file_default_encoding(filename):
|
def test_partition_html_from_file_default_encoding(filename):
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
||||||
with open(filename) as f:
|
with open(filename) as f:
|
||||||
elements = partition_html(file=f)
|
elements = partition_html(file=f)
|
||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
|
if filename == "fake-html-lang-de.html":
|
||||||
|
assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -133,13 +141,15 @@ def test_partition_html_from_file_rb_raises_encoding_error(filename, encoding, e
|
|||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"filename",
|
"filename",
|
||||||
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html"],
|
["example-10k-utf-16.html", "example-steelJIS-datasheet-utf-16.html", "fake-html-lang-de.html"],
|
||||||
)
|
)
|
||||||
def test_partition_html_from_file_rb_default_encoding(filename):
|
def test_partition_html_from_file_rb_default_encoding(filename):
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", filename)
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
elements = partition_html(file=f)
|
elements = partition_html(file=f)
|
||||||
assert len(elements) > 0
|
assert len(elements) > 0
|
||||||
|
if filename == "fake-html-lang-de.html":
|
||||||
|
assert elements == EXPECTED_OUTPUT_LANGUAGE_DE
|
||||||
|
|
||||||
|
|
||||||
def test_partition_html_from_text():
|
def test_partition_html_from_text():
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.8.0-dev1" # pragma: no cover
|
__version__ = "0.8.0-dev2" # pragma: no cover
|
||||||
|
@ -4,7 +4,7 @@ import chardet
|
|||||||
|
|
||||||
from unstructured.partition.common import convert_to_bytes
|
from unstructured.partition.common import convert_to_bytes
|
||||||
|
|
||||||
ENCODE_REC_THRESHOLD = 0.5
|
ENCODE_REC_THRESHOLD = 0.8
|
||||||
|
|
||||||
# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
|
# popular encodings from https://en.wikipedia.org/wiki/Popularity_of_text_encodings
|
||||||
COMMON_ENCODINGS = [
|
COMMON_ENCODINGS = [
|
||||||
|
Loading…
x
Reference in New Issue
Block a user