2024-04-16 23:14:53 +02:00
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
2022-10-04 18:51:11 +05:00
|
|
|
|
import json
|
2023-02-27 17:30:54 +01:00
|
|
|
|
import os
|
|
|
|
|
|
2022-10-04 18:51:11 +05:00
|
|
|
|
import pytest
|
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
|
from unstructured import utils
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those in code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
from unstructured.utils import catch_overlapping_and_nested_bboxes
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
from unstructured.documents.coordinates import PixelSpace
|
|
|
|
|
from unstructured.documents.elements import ElementMetadata, NarrativeText, Title
|
2022-10-04 18:51:11 +05:00
|
|
|
|
|
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
|
@pytest.fixture()
def input_data():
    """Provide two JSON-serializable records used by the JSONL round-trip tests.

    The first record carries only a ``text`` key; the second adds a nested
    ``meta`` mapping so nested structures are exercised as well.
    """
    plain_record = {"text": "This is a sentence."}
    scored_record = {"text": "This is another sentence.", "meta": {"score": 0.1}}
    return [plain_record, scored_record]
|
|
|
|
|
|
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
|
@pytest.fixture()
def output_jsonl_file(tmp_path):
    """Return the path of ``output.jsonl`` inside the ``tmp_path`` directory.

    The file itself is not created here; only the joined path string is returned.
    """
    destination = os.path.join(tmp_path, "output.jsonl")
    return destination
|
|
|
|
|
|
|
|
|
|
|
2023-02-27 17:30:54 +01:00
|
|
|
|
@pytest.fixture()
def input_jsonl_file(tmp_path, input_data):
    """Write ``input_data`` to ``input.jsonl`` under ``tmp_path`` and return its path.

    Each record is serialized with ``json.dumps`` and terminated by a newline,
    producing one JSON document per line.
    """
    jsonl_path = os.path.join(tmp_path, "input.jsonl")
    serialized = "".join(f"{json.dumps(record)}\n" for record in input_data)
    with open(jsonl_path, "w+") as handle:
        handle.write(serialized)
    return jsonl_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_save_as_jsonl(input_data, output_jsonl_file):
    """Records written by ``utils.save_as_jsonl`` parse back, line by line, to the input."""
    utils.save_as_jsonl(input_data, output_jsonl_file)

    # Decode every line of the written file independently, as a JSONL reader would.
    with open(output_jsonl_file) as handle:
        round_tripped = list(map(json.loads, handle))

    assert round_tripped == input_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_read_as_jsonl(input_jsonl_file, input_data):
    """``utils.read_from_jsonl`` returns the records that the fixture wrote to disk."""
    loaded_records = utils.read_from_jsonl(input_jsonl_file)

    assert loaded_records == input_data
|
2023-02-28 15:50:39 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_requires_dependencies_decorator():
    """A function guarded by a single string dependency runs without raising.

    NOTE(review): relies on ``numpy`` being importable in the test environment.
    """

    def uses_numpy():
        import numpy  # noqa: F401

    # Apply the decorator explicitly instead of via ``@`` syntax.
    require_numpy = utils.requires_dependencies(dependencies="numpy")
    guarded = require_numpy(uses_numpy)

    guarded()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_requires_dependencies_decorator_multiple():
    """A function guarded by a list of dependencies runs without raising.

    NOTE(review): relies on ``numpy`` and ``pandas`` being importable in the
    test environment.
    """

    def uses_numpy_and_pandas():
        import numpy  # noqa: F401
        import pandas  # noqa: F401

    # Apply the decorator explicitly instead of via ``@`` syntax.
    require_both = utils.requires_dependencies(dependencies=["numpy", "pandas"])
    guarded = require_both(uses_numpy_and_pandas)

    guarded()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_requires_dependencies_decorator_import_error():
    """Calling a function guarded by a missing dependency raises ``ImportError``."""

    def uses_missing_package():
        import not_a_package  # noqa: F401

    # Apply the decorator explicitly instead of via ``@`` syntax.
    require_missing = utils.requires_dependencies(dependencies="not_a_package")
    guarded = require_missing(uses_missing_package)

    with pytest.raises(ImportError):
        guarded()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_requires_dependencies_decorator_import_error_multiple():
    """``ImportError`` is raised when one of several listed dependencies is missing."""

    def uses_missing_and_numpy():
        import not_a_package  # noqa: F401
        import numpy  # noqa: F401

    # Apply the decorator explicitly instead of via ``@`` syntax.
    require_mixed = utils.requires_dependencies(dependencies=["not_a_package", "numpy"])
    guarded = require_mixed(uses_missing_and_numpy)

    with pytest.raises(ImportError):
        guarded()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_requires_dependencies_decorator_in_class():
    """A class decorated with ``requires_dependencies`` can still be instantiated.

    NOTE(review): relies on ``numpy`` being importable in the test environment.
    """

    @utils.requires_dependencies(dependencies="numpy")
    class NumpyBackedClass:
        def __init__(self):
            import numpy  # noqa: F401

    # Instantiation is the whole assertion: it must not raise.
    NumpyBackedClass()
|
2023-10-12 20:28:46 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "iterator",
    [
        [0, 1],
        (0, 1),
        range(10),
        [0],
        (0,),
        range(1),
    ],
)
def test_first_gives_first(iterator):
    """``utils.first`` returns the leading item; every parametrized iterable starts with 0."""
    leading_item = utils.first(iterator)
    assert leading_item == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "iterator",
    [
        [],
        (),
    ],
)
def test_first_raises_if_empty(iterator):
    """``utils.first`` raises ``ValueError`` when given an empty list or tuple."""
    with pytest.raises(ValueError):
        utils.first(iterator)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize("iterator", [[0], (0,), range(1)])
def test_only_gives_only(iterator):
    """``utils.only`` returns the sole item of a single-element iterable.

    Every parametrized iterable contains exactly one element, ``0``.
    """
    # This test previously called ``utils.first``, which left the success path
    # of ``utils.only`` untested.
    assert utils.only(iterator) == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "iterator",
    [
        [0, 1],
        (0, 1),
        range(10),
    ],
)
def test_only_raises_when_len_more_than_1(iterator):
    """``utils.only`` raises ``ValueError`` when the iterable holds more than one item."""
    with pytest.raises(ValueError):
        utils.only(iterator)
|
2023-10-12 20:28:46 -05:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "iterator",
    [
        [],
        (),
    ],
)
def test_only_raises_if_empty(iterator):
    """``utils.only`` raises ``ValueError`` when given an empty list or tuple."""
    with pytest.raises(ValueError):
        utils.only(iterator)
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those in code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
from unstructured.utils import catch_overlapping_and_nested_bboxes
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
    print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
|
2024-03-06 15:58:10 -06:00
|
|
|
|
("coords1", "coords2", "text1", "text2", "nested_error_tolerance_px", "expectation"),
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
[
|
|
|
|
|
(
|
2024-03-06 15:58:10 -06:00
|
|
|
|
((4, 5), (4, 8), (7, 8), (7, 5)),
|
|
|
|
|
((2, 3), (2, 6), (5, 6), (5, 3)),
|
|
|
|
|
"Some lovely title",
|
|
|
|
|
"Some lovely text",
|
|
|
|
|
5, # large nested_error_tolerance_px
|
|
|
|
|
{
|
|
|
|
|
"overlapping_elements": ["Title(ix=0)", "NarrativeText(ix=1)"],
|
|
|
|
|
"parent_element": "Title(ix=0)",
|
|
|
|
|
"overlapping_case": "nested NarrativeText in Title",
|
|
|
|
|
"overlap_percentage": "100%",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"largest_ngram_percentage": 0,
|
|
|
|
|
"overlap_percentage_total": "5.88%",
|
|
|
|
|
"max_area": "9pxˆ2",
|
|
|
|
|
"min_area": "9pxˆ2",
|
|
|
|
|
"total_area": "18pxˆ2",
|
|
|
|
|
},
|
|
|
|
|
},
|
|
|
|
|
),
|
|
|
|
|
(
|
|
|
|
|
((4, 5), (4, 8), (7, 8), (7, 5)),
|
|
|
|
|
((2, 3), (2, 6), (5, 6), (5, 3)),
|
|
|
|
|
"Some lovely title",
|
|
|
|
|
"Some lovely text",
|
|
|
|
|
1, # small nested_error_tolerance_px
|
|
|
|
|
{
|
|
|
|
|
"overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
|
|
|
|
|
"parent_element": None,
|
|
|
|
|
"overlapping_case": "partial overlap sharing 50.0% of the text from1. "
|
|
|
|
|
"NarrativeText(2-gram)",
|
|
|
|
|
"overlap_percentage": "11.11%",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"largest_ngram_percentage": 50.0,
|
|
|
|
|
"overlap_percentage_total": "5.88%",
|
|
|
|
|
"max_area": "9pxˆ2",
|
|
|
|
|
"min_area": "9pxˆ2",
|
|
|
|
|
"total_area": "18pxˆ2",
|
|
|
|
|
},
|
|
|
|
|
},
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
),
|
|
|
|
|
(
|
2024-03-06 15:58:10 -06:00
|
|
|
|
((4, 5), (4, 8), (7, 8), (7, 5)),
|
|
|
|
|
((2, 3), (2, 6), (5, 6), (5, 3)),
|
|
|
|
|
"Some lovely title",
|
|
|
|
|
"Some lovely title", # same title
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=file_path_i, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
1,
|
2024-03-06 15:58:10 -06:00
|
|
|
|
{
|
|
|
|
|
"overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
|
|
|
|
|
"parent_element": None,
|
|
|
|
|
"overlapping_case": "partial overlap with duplicate text",
|
|
|
|
|
"overlap_percentage": "11.11%",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"largest_ngram_percentage": 0,
|
|
|
|
|
"overlap_percentage_total": "5.88%",
|
|
|
|
|
"max_area": "9pxˆ2",
|
|
|
|
|
"min_area": "9pxˆ2",
|
|
|
|
|
"total_area": "18pxˆ2",
|
|
|
|
|
},
|
|
|
|
|
},
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes does not have a
one-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This fits two purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assess the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection give us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`, to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help controling
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
from unstructured.utils import catch_overlapping_and_nested_bboxes
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
),
|
|
|
|
|
(
|
2024-03-06 15:58:10 -06:00
|
|
|
|
((4, 5), (4, 8), (7, 8), (7, 5)),
|
|
|
|
|
((2, 3), (2, 6), (5, 6), (5, 3)),
|
|
|
|
|
"Some lovely title",
|
|
|
|
|
"", # empty title
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled
differently to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
the currently present cases of overlapping bboxes. We need to propose
solutions to fix those in code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target`. Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter `add` is the pixel error tolerance for extra pixels outside
the parent region.
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
from unstructured.utils import catch_overlapping_and_nested_bboxes
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
1,
|
2024-03-06 15:58:10 -06:00
|
|
|
|
{
|
|
|
|
|
"overlapping_elements": ["1. NarrativeText(ix=1)", "0. Title(ix=0)"],
|
|
|
|
|
"parent_element": None,
|
|
|
|
|
"overlapping_case": ("partial overlap with empty content in 1. NarrativeText"),
|
|
|
|
|
"overlap_percentage": "11.11%",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"largest_ngram_percentage": 0,
|
|
|
|
|
"overlap_percentage_total": "5.88%",
|
|
|
|
|
"max_area": "9pxˆ2",
|
|
|
|
|
"min_area": "9pxˆ2",
|
|
|
|
|
"total_area": "18pxˆ2",
|
|
|
|
|
},
|
|
|
|
|
},
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled
differently to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
the currently present cases of overlapping bboxes. We need to propose
solutions to fix those in code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target`. Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter `add` is the pixel error tolerance for extra pixels outside
the parent region.
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
from unstructured.utils import catch_overlapping_and_nested_bboxes
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
),
|
|
|
|
|
(
|
2024-03-06 15:58:10 -06:00
|
|
|
|
((4, 5), (4, 8), (7, 8), (7, 5)),
|
|
|
|
|
((2, 3), (2, 6), (5, 6), (5, 3)),
|
|
|
|
|
"", # empty 1st title
|
|
|
|
|
"Some lovely title",
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled
differently to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
the currently present cases of overlapping bboxes. We need to propose
solutions to fix those in code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target`. Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter `add` is the pixel error tolerance for extra pixels outside
the parent region.
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
from unstructured.utils import catch_overlapping_and_nested_bboxes
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
1,
|
2024-03-06 15:58:10 -06:00
|
|
|
|
{
|
|
|
|
|
"overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
|
|
|
|
|
"parent_element": None,
|
|
|
|
|
"overlapping_case": "partial overlap with empty content in 0. Title",
|
|
|
|
|
"overlap_percentage": "11.11%",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"largest_ngram_percentage": 0,
|
|
|
|
|
"overlap_percentage_total": "5.88%",
|
|
|
|
|
"max_area": "9pxˆ2",
|
|
|
|
|
"min_area": "9pxˆ2",
|
|
|
|
|
"total_area": "18pxˆ2",
|
|
|
|
|
},
|
|
|
|
|
},
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled
differently to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
the currently present cases of overlapping bboxes. We need to propose
solutions to fix those in code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target`. Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter `add` is the pixel error tolerance for extra pixels outside
the parent region.
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
from unstructured.utils import catch_overlapping_and_nested_bboxes
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
),
|
|
|
|
|
(
|
2024-03-06 15:58:10 -06:00
|
|
|
|
((4, 5), (4, 8), (7, 8), (7, 5)),
|
|
|
|
|
((2, 3), (2, 6), (5, 6), (5, 3)),
|
|
|
|
|
"Some lovely title",
|
|
|
|
|
"Something totally different here", # diff text
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled
differently to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
the currently present cases of overlapping bboxes. We need to propose
solutions to fix those in code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target`. Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter `add` is the pixel error tolerance for extra pixels outside
the parent region.
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
from unstructured.utils import catch_overlapping_and_nested_bboxes
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
1,
|
2024-03-06 15:58:10 -06:00
|
|
|
|
{
|
|
|
|
|
"overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
|
|
|
|
|
"parent_element": None,
|
|
|
|
|
"overlapping_case": "partial overlap without sharing text",
|
|
|
|
|
"overlap_percentage": "11.11%",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"largest_ngram_percentage": 0,
|
|
|
|
|
"overlap_percentage_total": "5.88%",
|
|
|
|
|
"max_area": "9pxˆ2",
|
|
|
|
|
"min_area": "9pxˆ2",
|
|
|
|
|
"total_area": "18pxˆ2",
|
|
|
|
|
},
|
|
|
|
|
},
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled
differently to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
the currently present cases of overlapping bboxes. We need to propose
solutions to fix those in code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target`. Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter `add` is the pixel error tolerance for extra pixels outside
the parent region.
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
from unstructured.utils import catch_overlapping_and_nested_bboxes
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
),
|
|
|
|
|
(
|
2024-03-06 15:58:10 -06:00
|
|
|
|
((5, 6), (5, 10), (8, 10), (8, 6)), # diff coordinates
|
|
|
|
|
((1, 3), (2, 7), (6, 7), (5, 3)),
|
|
|
|
|
"Some lovely title",
|
|
|
|
|
"Some lovely text",
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled
differently to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
the currently present cases of overlapping bboxes. We need to propose
solutions to fix those in code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target`. Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter `add` is the pixel error tolerance for extra pixels outside
the parent region.
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
1,
|
2024-03-06 15:58:10 -06:00
|
|
|
|
{
|
|
|
|
|
"overlapping_elements": ["0. Title(ix=0)", "1. NarrativeText(ix=1)"],
|
|
|
|
|
"parent_element": None,
|
|
|
|
|
"overlapping_case": "Small partial overlap",
|
|
|
|
|
"overlap_percentage": "8.33%",
|
|
|
|
|
"metadata": {
|
|
|
|
|
"largest_ngram_percentage": 0,
|
|
|
|
|
"overlap_percentage_total": "3.23%",
|
|
|
|
|
"max_area": "20pxˆ2",
|
|
|
|
|
"min_area": "12pxˆ2",
|
|
|
|
|
"total_area": "32pxˆ2",
|
|
|
|
|
},
|
|
|
|
|
},
|
feat: method to catch and classify overlapping bounding boxes (#1803)
We have established that overlapping bounding boxes do not have a
one-size-fits-all solution, so different cases need to be handled differently
to avoid information loss. We have manually identified the
cases/categories of overlapping. Now we need a method to
programmatically classify overlapping-bboxes cases within detected
elements in a document, and return a report about it (list of cases with
metadata). This serves three purposes:
- **Evaluation**: We can have a pipeline using the DVC data registry
that assesses the performance of a detection model against a set of
documents (PDF/Images), by analysing the overlapping-bboxes cases it
has. The metadata in the output can be used for generating metrics for
this.
- **Scope overlapping cases**: Manual inspection gives us a clue about
currently present cases of overlapping bboxes. We need to propose
solutions to fix those on code. This method generates a report by
analysing several aspects of two overlapping regions. This data can be
used to profile and specify the necessary changes that will fix each
case.
- **Fix overlapping cases**: We could introduce this functionality in
the flow of a partition method (such as `partition_pdf`) to handle the
calls to post-processing methods to fix overlapping. Tested on ~331
documents, the worst time per page is around 5ms. For a document such as
`layout-parser-paper.pdf` it takes 4.46 ms.
Introduces functionality to take a list of unstructured elements (which
contain bounding boxes) and identify pairs of bounding boxes which
overlap and which case is pertinent to the pairing. This PR includes the
following methods in `utils.py`:
- **`ngrams(s, n)`**: Generate n-grams from a string
- **`calculate_shared_ngram_percentage(string_A, string_B, n)`**:
Calculate the percentage of `common_ngrams` between `string_A` and
`string_B` with reference to the total number of ngrams in `string_A`.
- **`calculate_largest_ngram_percentage(string_A, string_B)`**:
Iteratively call `calculate_shared_ngram_percentage` starting from the
biggest ngram possible until the shared percentage is >0.0%
- **`is_parent_box(parent_target, child_target, add=0)`**: True if the
`child_target` bounding box is nested in the `parent_target` Box format:
[`x_bottom_left`, `y_bottom_left`, `x_top_right`, `y_top_right`]. The
parameter 'add' is the pixel error tolerance for extra pixels outside
the parent region
- **`calculate_overlap_percentage(box1, box2,
intersection_ratio_method="total")`**: Box format: [`x_bottom_left`,
`y_bottom_left`, `x_top_right`, `y_top_right`]. Calculates the
percentage of overlapped region with reference to biggest element-region
(`intersection_ratio_method="parent"`), the smallest element-region
(`intersection_ratio_method="partial"`), or to the disjunctive union
region (`intersection_ratio_method="total"`).
- **`identify_overlapping_or_nesting_case`**: Identify if there are
nested or overlapping elements. If overlapping is present,
it identifies the case calling the method `identify_overlapping_case`.
- **`identify_overlapping_case`**: Classifies the overlapping case for
an element_pair input in one of 5 categories of overlapping.
- **`catch_overlapping_and_nested_bboxes`**: Catch overlapping and
nested bounding boxes cases across a list of elements. The params
`nested_error_tolerance_px` and `sm_overlap_threshold` help control
the separation of the cases.
The overlapping/nested elements cases that are being caught are:
1. **Nested elements**
2. **Small partial overlap**
3. **Partial overlap with empty content**
4. **Partial overlap with duplicate text (sharing 100% of the text)**
5. **Partial overlap without sharing text**
6. **Partial overlap sharing**
{`calculate_largest_ngram_percentage(...)`}% **of the text**
Here is a snippet to test it:
```
from unstructured.partition.auto import partition
model_name = "yolox_quantized"
target = "sample-docs/layout-parser-paper-fast.pdf"
elements = partition(filename=target, strategy='hi_res', model_name=model_name)
overlapping_flag, overlapping_cases = catch_overlapping_and_nested_bboxes(elements)
for case in overlapping_cases:
print(case, "\n")
```
Here is a screenshot of a json built with the output list
`overlapping_cases`:
<img width="377" alt="image"
src="https://github.com/Unstructured-IO/unstructured/assets/38184042/a6fea64b-d40a-4e01-beda-27840f4f4b3a">
2023-10-25 05:17:34 -07:00
|
|
|
|
),
|
|
|
|
|
],
|
|
|
|
|
)
|
|
|
|
|
def test_catch_overlapping_and_nested_bboxes(
    coords1, coords2, text1, text2, nested_error_tolerance_px, expectation
):
    """A Title/NarrativeText pair is flagged and its first reported case equals `expectation`."""
    # Both elements sit on page 1 of a 20x20 pixel space; only text and coordinates vary.
    title = Title(
        text=text1,
        coordinates=coords1,
        coordinate_system=PixelSpace(width=20, height=20),
        metadata=ElementMetadata(page_number=1),
    )
    narrative = NarrativeText(
        text=text2,
        coordinates=coords2,
        coordinate_system=PixelSpace(width=20, height=20),
        metadata=ElementMetadata(page_number=1),
    )

    has_overlap, cases = utils.catch_overlapping_and_nested_bboxes(
        [title, narrative],
        nested_error_tolerance_px,
        sm_overlap_threshold=10.0,
    )

    assert has_overlap is True
    assert cases[0] == expectation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_catch_overlapping_and_nested_bboxes_non_overlapping_case():
    """Two elements whose boxes do not intersect produce a False flag and no cases."""
    title = Title(
        text="Some lovely title",
        coordinates=((4, 6), (4, 7), (7, 7), (7, 6)),
        coordinate_system=PixelSpace(width=20, height=20),
        metadata=ElementMetadata(page_number=1),
    )
    narrative = NarrativeText(
        text="Some lovely text",
        coordinates=((6, 8), (6, 9), (9, 9), (9, 8)),
        coordinate_system=PixelSpace(width=20, height=20),
        metadata=ElementMetadata(page_number=1),
    )

    has_overlap, cases = utils.catch_overlapping_and_nested_bboxes(
        [title, narrative],
        1,
        sm_overlap_threshold=10.0,
    )

    assert has_overlap is False
    assert cases == []
|
2024-02-14 08:41:43 -05:00
|
|
|
|
|
|
|
|
|
|
2024-03-06 15:58:10 -06:00
|
|
|
|
def test_only_returns_singleton_iterable():
    """`utils.only` hands back the sole item of a one-element iterable."""
    assert utils.only([42]) == 42
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_only_raises_on_non_singleton_iterable():
    """`utils.only` raises ValueError when the iterable holds more than one item."""
    two_items = [42, 0]

    with pytest.raises(ValueError):
        utils.only(two_items)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_calculate_shared_ngram_percentage_returns_null_vals_for_empty_str():
    """An empty first string yields a zero percentage and an empty set of shared n-grams."""
    shared_pct, shared_ngrams = utils.calculate_shared_ngram_percentage(
        "", "banana orange pineapple", 2
    )

    assert shared_pct == 0
    assert not shared_ngrams
|