unstructured/test_unstructured/test_utils.py

from __future__ import annotations

import json
import os

import pytest

from unstructured import utils
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import ElementMetadata, NarrativeText, Title


@pytest.fixture()
def input_data():
    return [
        {"text": "This is a sentence."},
        {"text": "This is another sentence.", "meta": {"score": 0.1}},
    ]


@pytest.fixture()
def output_jsonl_file(tmp_path):
    return os.path.join(tmp_path, "output.jsonl")


@pytest.fixture()
def input_jsonl_file(tmp_path, input_data):
    file_path = os.path.join(tmp_path, "input.jsonl")
    with open(file_path, "w+") as input_file:
        input_file.writelines([json.dumps(obj) + "\n" for obj in input_data])
    return file_path


def test_save_as_jsonl(input_data, output_jsonl_file):
    utils.save_as_jsonl(input_data, output_jsonl_file)
    with open(output_jsonl_file) as output_file:
        file_data = [json.loads(line) for line in output_file]
    assert file_data == input_data


def test_read_as_jsonl(input_jsonl_file, input_data):
    file_data = utils.read_from_jsonl(input_jsonl_file)
    assert file_data == input_data


def test_requires_dependencies_decorator():
    @utils.requires_dependencies(dependencies="numpy")
    def test_func():
        import numpy  # noqa: F401

    test_func()


def test_requires_dependencies_decorator_multiple():
    @utils.requires_dependencies(dependencies=["numpy", "pandas"])
    def test_func():
        import numpy  # noqa: F401
        import pandas  # noqa: F401

    test_func()


def test_requires_dependencies_decorator_import_error():
    @utils.requires_dependencies(dependencies="not_a_package")
    def test_func():
        import not_a_package  # noqa: F401

    with pytest.raises(ImportError):
        test_func()


def test_requires_dependencies_decorator_import_error_multiple():
    @utils.requires_dependencies(dependencies=["not_a_package", "numpy"])
    def test_func():
        import not_a_package  # noqa: F401
        import numpy  # noqa: F401

    with pytest.raises(ImportError):
        test_func()


def test_requires_dependencies_decorator_in_class():
    @utils.requires_dependencies(dependencies="numpy")
    class TestClass:
        def __init__(self):
            import numpy  # noqa: F401

    TestClass()


@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10), [0], (0,), range(1)])
def test_first_gives_first(iterator):
    assert utils.first(iterator) == 0


@pytest.mark.parametrize("iterator", [[], ()])
def test_first_raises_if_empty(iterator):
    with pytest.raises(ValueError):
        utils.first(iterator)


@pytest.mark.parametrize("iterator", [[0], (0,), range(1)])
def test_only_gives_only(iterator):
    assert utils.first(iterator) == 0


@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10)])
def test_only_raises_when_len_more_than_1(iterator):
    with pytest.raises(ValueError):
        utils.only(iterator)


@pytest.mark.parametrize("iterator", [[], ()])
def test_only_raises_if_empty(iterator):
    with pytest.raises(ValueError):
        utils.only(iterator)


@pytest.mark.parametrize(
    ("coords1", "coords2", "text1", "text2", "nested_error_tolerance_px", "expectation"),
    [
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "Some lovely text",
            5,  # large nested_error_tolerance_px
            {
                "overlapping_elements": ["Title(ix=0)", "NarrativeText(ix=1)"],
                "parent_element": "Title(ix=0)",
                "overlapping_case": "nested NarrativeText in Title",
                "overlap_percentage": "100%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "Some lovely text",
            1,  # small nested_error_tolerance_px
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "partial overlap sharing 50.0% of the text from1. "
                "NarrativeText(2-gram)",
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 50.0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "Some lovely title",  # same title
            1,
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "partial overlap with duplicate text",
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "",  # empty title
            1,
            {
                "overlapping_elements": ["1. NarrativeText(ix=1)", "0. Title(ix=0)"],
                "parent_element": None,
                "overlapping_case": ("partial overlap with empty content in 1. NarrativeText"),
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "",  # empty 1st title
            "Some lovely title",
            1,
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "partial overlap with empty content in 0. Title",
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((4, 5), (4, 8), (7, 8), (7, 5)),
            ((2, 3), (2, 6), (5, 6), (5, 3)),
            "Some lovely title",
            "Something totally different here",  # diff text
            1,
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "partial overlap without sharing text",
                "overlap_percentage": "11.11%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "5.88%",
                    "max_area": "9pxˆ2",
                    "min_area": "9pxˆ2",
                    "total_area": "18pxˆ2",
                },
            },
        ),
        (
            ((5, 6), (5, 10), (8, 10), (8, 6)),  # diff coordinates
            ((1, 3), (2, 7), (6, 7), (5, 3)),
            "Some lovely title",
            "Some lovely text",
            1,
            {
                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
                "parent_element": None,
                "overlapping_case": "Small partial overlap",
                "overlap_percentage": "8.33%",
                "metadata": {
                    "largest_ngram_percentage": 0,
                    "overlap_percentage_total": "3.23%",
                    "max_area": "20pxˆ2",
                    "min_area": "12pxˆ2",
                    "total_area": "32pxˆ2",
                },
            },
        ),
    ],
)
def test_catch_overlapping_and_nested_bboxes(
    coords1, coords2, text1, text2, nested_error_tolerance_px, expectation
):
    elements = [
        Title(
            text=text1,
            coordinates=coords1,
            coordinate_system=PixelSpace(width=20, height=20),
            metadata=ElementMetadata(page_number=1),
        ),
        NarrativeText(
            text=text2,
            coordinates=coords2,
            coordinate_system=PixelSpace(width=20, height=20),
            metadata=ElementMetadata(page_number=1),
        ),
    ]
    overlapping_flag, overlapping_cases = utils.catch_overlapping_and_nested_bboxes(
        elements,
        nested_error_tolerance_px,
        sm_overlap_threshold=10.0,
    )
    assert overlapping_flag is True
    assert overlapping_cases[0] == expectation


def test_catch_overlapping_and_nested_bboxes_non_overlapping_case():
    elements = [
        Title(
            text="Some lovely title",
            coordinates=((4, 6), (4, 7), (7, 7), (7, 6)),
            coordinate_system=PixelSpace(width=20, height=20),
            metadata=ElementMetadata(page_number=1),
        ),
        NarrativeText(
            text="Some lovely text",
            coordinates=((6, 8), (6, 9), (9, 9), (9, 8)),
            coordinate_system=PixelSpace(width=20, height=20),
            metadata=ElementMetadata(page_number=1),
        ),
    ]
    overlapping_flag, overlapping_cases = utils.catch_overlapping_and_nested_bboxes(
        elements,
        1,
        sm_overlap_threshold=10.0,
    )
    assert overlapping_flag is False
    assert overlapping_cases == []


def test_only_returns_singleton_iterable():
    singleton_iterable = [42]
    result = utils.only(singleton_iterable)
    assert result == 42


def test_only_raises_on_non_singleton_iterable():
    singleton_iterable = [42, 0]
    with pytest.raises(ValueError):
        utils.only(singleton_iterable)


def test_calculate_shared_ngram_percentage_returns_null_vals_for_empty_str():
    str1 = ""
    str2 = "banana orange pineapple"
    n = 2
    percent, common_ngrams = utils.calculate_shared_ngram_percentage(str1, str2, n)
    assert percent == 0
    assert not bool(common_ngrams)
-												Preparing the foundation for better element IDs (#2842)

Part one of the issue described here:
https://github.com/Unstructured-IO/unstructured/issues/2461

It does not change how hashing algorithm works, just reworks how ids are
assigned:
> Element ID Design Principles
> 
> 1. A partitioning function can assign only one of two available ID
types to a returned element: a hash or UUID.
> 2. All elements that are returned come with an ID, which is never
None.
> 3. No matter which type of ID is used, it will always be in string
format.
> 4. Partitioning a document returns elements with hashes as their
default IDs.

Big thanks to @scanny for explaining the current design and suggesting
ways to do it right, especially with chunking.


Here's the next PR in line:
https://github.com/Unstructured-IO/unstructured/pull/2673

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: micmarty-deepsense <micmarty-deepsense@users.noreply.github.com>
											
										
										
											2024-04-16 23:14:53 +02:00
+								from __future__ import annotations
-												feat: Implement utility functions for reading and writing `.jsonl` files (#22)

* Implement save_as_jsonl and read_from_jsonl utility functions

* Add unit tests for save_as_jsonl and read_from_jsonl utility functions

* Add example of using save_as_jsonl with prodigy staging brick

* Bump version and update changelog

* remove accidentally added prodigy json file

* added "the" in jsonl description

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
											
										
										
											2022-10-04 18:51:11 +05:00
+								import json
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								import os
-												feat: Implement utility functions for reading and writing `.jsonl` files (#22)

* Implement save_as_jsonl and read_from_jsonl utility functions

* Add unit tests for save_as_jsonl and read_from_jsonl utility functions

* Add example of using save_as_jsonl with prodigy staging brick

* Bump version and update changelog

* remove accidentally added prodigy json file

* added "the" in jsonl description

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
											
										
										
											2022-10-04 18:51:11 +05:00
+								import pytest
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								from unstructured import utils
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								from unstructured.documents.coordinates import PixelSpace
 								from unstructured.documents.elements import ElementMetadata, NarrativeText, Title
-												feat: Implement utility functions for reading and writing `.jsonl` files (#22)

* Implement save_as_jsonl and read_from_jsonl utility functions

* Add unit tests for save_as_jsonl and read_from_jsonl utility functions

* Add example of using save_as_jsonl with prodigy staging brick

* Bump version and update changelog

* remove accidentally added prodigy json file

* added "the" in jsonl description

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
											
										
										
											2022-10-04 18:51:11 +05:00
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: Implement utility functions for reading and writing `.jsonl` files (#22)

* Implement save_as_jsonl and read_from_jsonl utility functions

* Add unit tests for save_as_jsonl and read_from_jsonl utility functions

* Add example of using save_as_jsonl with prodigy staging brick

* Bump version and update changelog

* remove accidentally added prodigy json file

* added "the" in jsonl description

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
											
										
										
											2022-10-04 18:51:11 +05:00
+								def input_data():
 								    return [
 								        {"text": "This is a sentence."},
 								        {"text": "This is another sentence.", "meta": {"score": 0.1}},
 								    ]
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: Implement utility functions for reading and writing `.jsonl` files (#22)

* Implement save_as_jsonl and read_from_jsonl utility functions

* Add unit tests for save_as_jsonl and read_from_jsonl utility functions

* Add example of using save_as_jsonl with prodigy staging brick

* Bump version and update changelog

* remove accidentally added prodigy json file

* added "the" in jsonl description

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
											
										
										
											2022-10-04 18:51:11 +05:00
+								def output_jsonl_file(tmp_path):
 								    return os.path.join(tmp_path, "output.jsonl")
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								@pytest.fixture()
-												feat: Implement utility functions for reading and writing `.jsonl` files (#22)

* Implement save_as_jsonl and read_from_jsonl utility functions

* Add unit tests for save_as_jsonl and read_from_jsonl utility functions

* Add example of using save_as_jsonl with prodigy staging brick

* Bump version and update changelog

* remove accidentally added prodigy json file

* added "the" in jsonl description

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
											
										
										
											2022-10-04 18:51:11 +05:00
+								def input_jsonl_file(tmp_path, input_data):
 								    file_path = os.path.join(tmp_path, "input.jsonl")
 								    with open(file_path, "w+") as input_file:
 								        input_file.writelines([json.dumps(obj) + "\n" for obj in input_data])
 								    return file_path
 								def test_save_as_jsonl(input_data, output_jsonl_file):
 								    utils.save_as_jsonl(input_data, output_jsonl_file)
-												Resolve various style issues to improve overall code quality (#282)

* Apply import sorting

ruff . --select I --fix

* Remove unnecessary open mode parameter

ruff . --select UP015 --fix

* Use f-string formatting rather than .format

* Remove extraneous parentheses

Also use "" instead of str()

* Resolve missing trailing commas

ruff . --select COM --fix

* Rewrite list() and dict() calls using literals

ruff . --select C4 --fix

* Add () to pytest.fixture, use tuples for parametrize, etc.

ruff . --select PT --fix

* Simplify code: merge conditionals, context managers

ruff . --select SIM --fix

* Import without unnecessary alias

ruff . --select PLR0402 --fix

* Apply formatting via black

* Rewrite ValueError somewhat

Slightly unrelated to the rest of the PR

* Apply formatting to tests via black

* Update expected exception message to match
0d81564

* Satisfy E501 line too long in test

* Update changelog & version

* Add ruff to make tidy and test deps

* Run 'make tidy'

* Update changelog & version

* Update changelog & version

* Add ruff to 'check' target

Doing so required me to also fix some non-auto-fixable issues. Two of them I fixed with a noqa: SIM115, but especially the one in __init__ may need some attention. That said, that refactor is out of scope of this PR.
											
										
										
											2023-02-27 17:30:54 +01:00
+								    with open(output_jsonl_file) as output_file:
-												feat: Implement utility functions for reading and writing `.jsonl` files (#22)

* Implement save_as_jsonl and read_from_jsonl utility functions

* Add unit tests for save_as_jsonl and read_from_jsonl utility functions

* Add example of using save_as_jsonl with prodigy staging brick

* Bump version and update changelog

* remove accidentally added prodigy json file

* added "the" in jsonl description

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
											
										
										
											2022-10-04 18:51:11 +05:00
+								        file_data = [json.loads(line) for line in output_file]
 								    assert file_data == input_data
 								def test_read_as_jsonl(input_jsonl_file, input_data):
 								    file_data = utils.read_from_jsonl(input_jsonl_file)
 								    assert file_data == input_data
-												feat: add `requires_dependencies` decorator (#302)

* Add `requires_dependencies` decorator

* Use `required_dependencies` on Reddit & S3

* Fix bug in `requires_dependencies`

To used named args the decorator needs to be also wrapped

* Add `requires_dependencies` integration tests

* Add `requires_dependencies` in `Competition.md`

* Update `CHANGELOG.md`

* Bump version 0.4.16-dev5

* Ignore `F401` unused imports in `requires_dependencies` tests

* Apply suggestions from code review

* Add `functools.wrap` to keep docs, & annotations

* Use `requires_dependencies` in `GitHubConnector`
											
										
										
											2023-02-28 15:50:39 +01:00
 								def test_requires_dependencies_decorator():
 								    @utils.requires_dependencies(dependencies="numpy")
 								    def test_func():
 								        import numpy  # noqa: F401
 								    test_func()
 								def test_requires_dependencies_decorator_multiple():
 								    @utils.requires_dependencies(dependencies=["numpy", "pandas"])
 								    def test_func():
 								        import numpy  # noqa: F401
 								        import pandas  # noqa: F401
 								    test_func()
 								def test_requires_dependencies_decorator_import_error():
 								    @utils.requires_dependencies(dependencies="not_a_package")
 								    def test_func():
 								        import not_a_package  # noqa: F401
 								    with pytest.raises(ImportError):
 								        test_func()
 								def test_requires_dependencies_decorator_import_error_multiple():
 								    @utils.requires_dependencies(dependencies=["not_a_package", "numpy"])
 								    def test_func():
 								        import not_a_package  # noqa: F401
 								        import numpy  # noqa: F401
 								    with pytest.raises(ImportError):
 								        test_func()
 								def test_requires_dependencies_decorator_in_class():
 								    @utils.requires_dependencies(dependencies="numpy")
 								    class TestClass:
 								        def __init__(self):
 								            import numpy  # noqa: F401
 								    TestClass()
-												chore: process chipper hierarchy (#1634)

PR to support schema changes introduced from [PR
232](https://github.com/Unstructured-IO/unstructured-inference/pull/232)
in `unstructured-inference`.

Specifically what needs to be supported is:
* Change to the way `LayoutElement` from `unstructured-inference` is
structured, specifically that this class is no longer a subclass of
`Rectangle`, and instead `LayoutElement` has a `bbox` property that
captures the location information and a `from_coords` method that allows
construction of a `LayoutElement` directly from coordinates.
* Removal of `LocationlessLayoutElement` since chipper now exports
bounding boxes, and if we need to support elements without bounding
boxes, we can make the `bbox` property mentioned above optional.
* Getting hierarchy data directly from the inference elements rather
than in post-processing
* Don't try to reorder elements received from chipper v2, as they should
already be ordered.

#### Testing:

The following demonstrates that the new version of chipper is inferring
hierarchy.

```python
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", strategy="hi_res", model_name="chipper")
children = [el for el in elements if el.metadata.parent_id is not None]
print(children)

```
Also verify that running the traditional `hi_res` gives different
results:
```python
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", strategy="hi_res")

```

---------

Co-authored-by: Sebastian Laverde Alfonso <lavmlk20201@gmail.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2023-10-12 20:28:46 -05:00
 								@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10), [0], (0,), range(1)])
 								def test_first_gives_first(iterator):
 								    assert utils.first(iterator) == 0
 								@pytest.mark.parametrize("iterator", [[], ()])
 								def test_first_raises_if_empty(iterator):
 								    with pytest.raises(ValueError):
 								        utils.first(iterator)
 								@pytest.mark.parametrize("iterator", [[0], (0,), range(1)])
 								def test_only_gives_only(iterator):
 								    assert utils.first(iterator) == 0
 								@pytest.mark.parametrize("iterator", [[0, 1], (0, 1), range(10)])
 								def test_only_raises_when_len_more_than_1(iterator):
 								    with pytest.raises(ValueError):
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								        utils.only(iterator)
-												chore: process chipper hierarchy (#1634)

PR to support schema changes introduced from [PR
232](https://github.com/Unstructured-IO/unstructured-inference/pull/232)
in `unstructured-inference`.

Specifically what needs to be supported is:
* Change to the way `LayoutElement` from `unstructured-inference` is
structured, specifically that this class is no longer a subclass of
`Rectangle`, and instead `LayoutElement` has a `bbox` property that
captures the location information and a `from_coords` method that allows
construction of a `LayoutElement` directly from coordinates.
* Removal of `LocationlessLayoutElement` since chipper now exports
bounding boxes, and if we need to support elements without bounding
boxes, we can make the `bbox` property mentioned above optional.
* Getting hierarchy data directly from the inference elements rather
than in post-processing
* Don't try to reorder elements received from chipper v2, as they should
already be ordered.

#### Testing:

The following demonstrates that the new version of chipper is inferring
hierarchy.

```python
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", strategy="hi_res", model_name="chipper")
children = [el for el in elements if el.metadata.parent_id is not None]
print(children)

```
Also verify that running the traditional `hi_res` gives different
results:
```python
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", strategy="hi_res")

```

---------

Co-authored-by: Sebastian Laverde Alfonso <lavmlk20201@gmail.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
											
										
										
											2023-10-12 20:28:46 -05:00
 								@pytest.mark.parametrize("iterator", [[], ()])
 								def test_only_raises_if_empty(iterator):
 								    with pytest.raises(ValueError):
 								        utils.only(iterator)
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
 								@pytest.mark.parametrize(
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								    ("coords1", "coords2", "text1", "text2", "nested_error_tolerance_px", "expectation"),
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								    [
 								        (
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            ((4, 5), (4, 8), (7, 8), (7, 5)),
 								            ((2, 3), (2, 6), (5, 6), (5, 3)),
 								            "Some lovely title",
 								            "Some lovely text",
 ,  # large nested_error_tolerance_px
 								            {
 								                "overlapping_elements": ["Title(ix=0)", "NarrativeText(ix=1)"],
 								                "parent_element": "Title(ix=0)",
 								                "overlapping_case": "nested NarrativeText in Title",
 								                "overlap_percentage": "100%",
 								                "metadata": {
 								                    "largest_ngram_percentage": 0,
 								                    "overlap_percentage_total": "5.88%",
 								                    "max_area": "9pxˆ2",
 								                    "min_area": "9pxˆ2",
 								                    "total_area": "18pxˆ2",
 								                },
 								            },
 								        ),
 								        (
 								            ((4, 5), (4, 8), (7, 8), (7, 5)),
 								            ((2, 3), (2, 6), (5, 6), (5, 3)),
 								            "Some lovely title",
 								            "Some lovely text",
 ,  # small nested_error_tolerance_px
 								            {
 								                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
 								                "parent_element": None,
 								                "overlapping_case": "partial overlap sharing 50.0% of the text from1. "
 								                "NarrativeText(2-gram)",
 								                "overlap_percentage": "11.11%",
 								                "metadata": {
 								                    "largest_ngram_percentage": 50.0,
 								                    "overlap_percentage_total": "5.88%",
 								                    "max_area": "9pxˆ2",
 								                    "min_area": "9pxˆ2",
 								                    "total_area": "18pxˆ2",
 								                },
 								            },
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								        ),
 								        (
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            ((4, 5), (4, 8), (7, 8), (7, 5)),
 								            ((2, 3), (2, 6), (5, 6), (5, 3)),
 								            "Some lovely title",
 								            "Some lovely title",  # same title
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+,
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            {
 								                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
 								                "parent_element": None,
 								                "overlapping_case": "partial overlap with duplicate text",
 								                "overlap_percentage": "11.11%",
 								                "metadata": {
 								                    "largest_ngram_percentage": 0,
 								                    "overlap_percentage_total": "5.88%",
 								                    "max_area": "9pxˆ2",
 								                    "min_area": "9pxˆ2",
 								                    "total_area": "18pxˆ2",
 								                },
 								            },
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								        ),
 								        (
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            ((4, 5), (4, 8), (7, 8), (7, 5)),
 								            ((2, 3), (2, 6), (5, 6), (5, 3)),
 								            "Some lovely title",
 								            "",  # empty title
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+,
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            {
 								                "overlapping_elements": ["1. NarrativeText(ix=1)", "0. Title(ix=0)"],
 								                "parent_element": None,
 								                "overlapping_case": ("partial overlap with empty content in 1. NarrativeText"),
 								                "overlap_percentage": "11.11%",
 								                "metadata": {
 								                    "largest_ngram_percentage": 0,
 								                    "overlap_percentage_total": "5.88%",
 								                    "max_area": "9pxˆ2",
 								                    "min_area": "9pxˆ2",
 								                    "total_area": "18pxˆ2",
 								                },
 								            },
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								        ),
 								        (
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            ((4, 5), (4, 8), (7, 8), (7, 5)),
 								            ((2, 3), (2, 6), (5, 6), (5, 3)),
 								            "",  # empty 1st title
 								            "Some lovely title",
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+,
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            {
 								                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
 								                "parent_element": None,
 								                "overlapping_case": "partial overlap with empty content in 0. Title",
 								                "overlap_percentage": "11.11%",
 								                "metadata": {
 								                    "largest_ngram_percentage": 0,
 								                    "overlap_percentage_total": "5.88%",
 								                    "max_area": "9pxˆ2",
 								                    "min_area": "9pxˆ2",
 								                    "total_area": "18pxˆ2",
 								                },
 								            },
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								        ),
 								        (
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            ((4, 5), (4, 8), (7, 8), (7, 5)),
 								            ((2, 3), (2, 6), (5, 6), (5, 3)),
 								            "Some lovely title",
 								            "Something totally different here",  # diff text
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+,
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            {
 								                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
 								                "parent_element": None,
 								                "overlapping_case": "partial overlap without sharing text",
 								                "overlap_percentage": "11.11%",
 								                "metadata": {
 								                    "largest_ngram_percentage": 0,
 								                    "overlap_percentage_total": "5.88%",
 								                    "max_area": "9pxˆ2",
 								                    "min_area": "9pxˆ2",
 								                    "total_area": "18pxˆ2",
 								                },
 								            },
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								        ),
 								        (
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            ((5, 6), (5, 10), (8, 10), (8, 6)),  # diff coordinates
 								            ((1, 3), (2, 7), (6, 7), (5, 3)),
 								            "Some lovely title",
 								            "Some lovely text",
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+,
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								            {
 								                "overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
 								                "parent_element": None,
 								                "overlapping_case": "Small partial overlap",
 								                "overlap_percentage": "8.33%",
 								                "metadata": {
 								                    "largest_ngram_percentage": 0,
 								                    "overlap_percentage_total": "3.23%",
 								                    "max_area": "20pxˆ2",
 								                    "min_area": "12pxˆ2",
 								                    "total_area": "32pxˆ2",
 								                },
 								            },
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								        ),
 								    ],
 								)
 								def test_catch_overlapping_and_nested_bboxes(
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								    coords1, coords2, text1, text2, nested_error_tolerance_px, expectation
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								):
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								    elements = [
 								        Title(
 								            text=text1,
 								            coordinates=coords1,
 								            coordinate_system=PixelSpace(width=20, height=20),
 								            metadata=ElementMetadata(page_number=1),
 								        ),
 								        NarrativeText(
 								            text=text2,
 								            coordinates=coords2,
 								            coordinate_system=PixelSpace(width=20, height=20),
 								            metadata=ElementMetadata(page_number=1),
 								        ),
 								    ]
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								    overlapping_flag, overlapping_cases = utils.catch_overlapping_and_nested_bboxes(
 								        elements,
 								        nested_error_tolerance_px,
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								        sm_overlap_threshold=10.0,
-												feat: method to catch and classify overlapping bounding boxes (#1803)

We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:

- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.

Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:

- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.

The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**

Here is a snippet to test it:
```
from unstructured.partition.auto import partition

model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
											
										
										
											2023-10-25 05:17:34 -07:00
+								    )
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								    assert overlapping_flag is True
 								    assert overlapping_cases[0] == expectation
 								def test_catch_overlapping_and_nested_bboxes_non_overlapping_case():
 								    elements = [
 								        Title(
 								            text="Some lovely title",
 								            coordinates=((4, 6), (4, 7), (7, 7), (7, 6)),
 								            coordinate_system=PixelSpace(width=20, height=20),
 								            metadata=ElementMetadata(page_number=1),
 								        ),
 								        NarrativeText(
 								            text="Some lovely text",
 								            coordinates=((6, 8), (6, 9), (9, 9), (9, 8)),
 								            coordinate_system=PixelSpace(width=20, height=20),
 								            metadata=ElementMetadata(page_number=1),
 								        ),
 								    ]
 								    overlapping_flag, overlapping_cases = utils.catch_overlapping_and_nested_bboxes(
 								        elements,
 ,
 								        sm_overlap_threshold=10.0,
 								    )
 								    assert overlapping_flag is False
 								    assert overlapping_cases == []
-												fix: don't treat double quote enclosed text as JSON (#2544)

### Summary

Closes #2444. Treats JSON serializable content that results in a string
as plain text. Even though this is valid JSON per [RFC
4627](https://www.ietf.org/rfc/rfc4627.txt), this is valid JSON, but in
almost every cases were really want to treat this as a text file.

### Testing

1. Put `"This is not a JSON"` is a text file `notajson.txt`
2. Run the following

```python
from unstructured.file_utils.filetype import _is_text_file_a_json

_is_text_file_a_json(filename="notajson.txt") # Should be False
```
											
										
										
											2024-02-14 08:41:43 -05:00
-												chore: add tests and small fixes in utils.py (#2554)

Linting and typing fixes, and add tests to improve test coverage in
utils.py

On the main branch, run `coverage run -m pytest
test_unstructured/test_utils.py` and then `coverage report -m
unstructured/utils.py` to see test coverage for `utils.py`. Check out to
this branch and do the same. The percent coverage should increase to 88%

---------

Co-authored-by: David Potter <potterdavidm@gmail.com>
											
										
										
											2024-03-06 15:58:10 -06:00
+								def test_only_returns_singleton_iterable():
 								    singleton_iterable = [42]
 								    result = utils.only(singleton_iterable)
 								    assert result == 42
 								def test_only_raises_on_non_singleton_iterable():
 								    singleton_iterable = [42, 0]
 								    with pytest.raises(ValueError):
 								        utils.only(singleton_iterable)
 								def test_calculate_shared_ngram_percentage_returns_null_vals_for_empty_str():
 								    str1 = ""
 								    str2 = "banana orange pineapple"
 								    n = 2
 								    percent, common_ngrams = utils.calculate_shared_ngram_percentage(str1, str2, n)
 								    assert percent == 0
 								    assert not bool(common_ngrams)