diff --git a/example-docs/blank.xlsx b/example-docs/blank.xlsx
new file mode 100644
index 000000000..00b1af61d
Binary files /dev/null and b/example-docs/blank.xlsx differ
diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py
index 01d80b10d..d9c791c9f 100644
--- a/test_unstructured/metrics/test_element_type.py
+++ b/test_unstructured/metrics/test_element_type.py
@@ -40,6 +40,13 @@ def test_get_element_type_frequency(filename, frequency):
     assert elements_freq == frequency
 
 
+def test_get_element_type_frequency_zero_len():
+    """Partitioning blank.xlsx yields an empty element-type frequency result."""
+    elements = partition(filename="example-docs/blank.xlsx")
+    elements_freq = get_element_type_frequency(elements_to_json(elements))
+    assert len(elements_freq) == 0
+
+
 @pytest.mark.parametrize(
     ("filename", "expected_frequency", "percent_matched"),
     [
@@ -107,3 +114,12 @@ def test_calculate_element_type_percent_match(filename, expected_frequency, perc
         round(calculate_element_type_percent_match(elements_frequency, expected_frequency, 0.8), 2)
         == percent_matched[2]
     )
+
+
+def test_calculate_element_type_percent_match_zero_source_output():
+    """Percent match is 0.0 when either side is the frequency computed from blank.xlsx."""
+    with_frequency = {("Header", None): 1}
+    elements = partition(filename="example-docs/blank.xlsx")
+    no_frequency = get_element_type_frequency(elements_to_json(elements))
+    assert calculate_element_type_percent_match(with_frequency, no_frequency) == 0.0
+    assert calculate_element_type_percent_match(no_frequency, with_frequency) == 0.0
diff --git a/test_unstructured/metrics/test_text_extraction.py b/test_unstructured/metrics/test_text_extraction.py
index c5921e196..b5f70e33c 100644
--- a/test_unstructured/metrics/test_text_extraction.py
+++ b/test_unstructured/metrics/test_text_extraction.py
@@ -216,3 +216,11 @@ def test_calculate_percent_missing_text(output_text, source_text, expected_perce
         text_extraction.calculate_percent_missing_text(output_text, source_text)
         == expected_percentage
     )
+
+
+def test_error_return_type():
+    """calculate_edit_distance raises ValueError when its third argument is 'typo'."""
+    output_elements = partition(filename="example-docs/fake-text.txt")
+    source_elements = partition(filename="example-docs/fake-text.txt")
+    with pytest.raises(ValueError):
+        text_extraction.calculate_edit_distance(output_elements, source_elements, "typo")
diff --git a/test_unstructured/partition/xlsx/test_xlsx.py b/test_unstructured/partition/xlsx/test_xlsx.py
index 8fb7a5ac2..c395da477 100644
--- a/test_unstructured/partition/xlsx/test_xlsx.py
+++ b/test_unstructured/partition/xlsx/test_xlsx.py
@@ -226,6 +226,13 @@ def test_partition_xlsx_subtables(filename="example-docs/vodafone.xlsx"):
     assert len(elements) == 6
 
 
+def test_partition_xlsx_not_find_subtable(filename="example-docs/vodafone.xlsx"):
+    """With find_subtable=False the default workbook partitions into exactly one Table."""
+    elements = partition_xlsx(filename, find_subtable=False)
+    assert sum(isinstance(element, Table) for element in elements) == 1
+    assert len(elements) == 1
+
+
 def test_partition_xlsx_element_metadata_has_languages():
     filename = "example-docs/stanley-cups.xlsx"
     elements = partition_xlsx(filename=filename)