unstructured/test_unstructured/partition/html/test_html_to_ontology_parsing.py

from bs4 import BeautifulSoup

from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page
from unstructured.partition.html.html_utils import indent_html
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology


def _wrap_with_body(html: str) -> str:
    """Return *html* enclosed in a ``<body class="Document">`` element."""
    return '<body class="Document">{}</body>'.format(html)


def remove_all_ids(html_str):
    """Return ``html_str`` re-serialized with every ``id`` attribute removed.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend, each
    tag that has an ``id`` attribute gets it deleted, and the result is
    returned as ``str(soup)``.
    """
    document = BeautifulSoup(html_str, "html.parser")
    for element in document.find_all(True):
        if not element.has_attr("id"):
            continue
        del element["id"]
    return str(document)


def test_wrong_html_parser_causes_paragraph_to_be_nested_in_div():
    """The ``<address>`` must stay nested inside its ``<p>`` after parsing.

    Per the original author's note, running the html5lib parser over this
    input would yield ``Page: <p></p> <address></address>`` instead of
    ``Page: <p><address></address></p>``, and this test would then fail.
    """
    # language=HTML
    markup = """
    <div class="Page">
    <p class="NarrativeText">
     <address class="Address">
      Mountain View, California
     </address>
    </p>
    </div>
    """
    page_children = parse_html_to_ontology(markup).children
    assert len(page_children) == 1

    # The single page child is the paragraph; it must own exactly one child.
    paragraph_children = page_children[0].children
    assert len(paragraph_children) == 1

    assert paragraph_children[0].text == "Mountain View, California"


def test_when_class_is_missing_it_can_be_inferred_from_type():
    """A class-less ``<aside>`` is expected to be emitted with class ``Sidebar``.

    The expected markup also wraps the bare text in a ``Paragraph`` ``<p>``.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
        <aside>Some text</aside>
    </div>
    """
    # language=HTML
    raw_expected = """
    <div class="Page">
        <aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_when_class_is_wrong_tag_name_is_overwritten():
    """A ``<p class='Sidebar'>`` is expected to be emitted as an ``<aside>``.

    The class is kept while the tag name changes; the expected markup also
    wraps the text in a ``Paragraph`` ``<p>``.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
        <p class='Sidebar'>Some text</p>
    </div>
    """
    # language=HTML
    raw_expected = """
    <div class="Page">
        <aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_when_tag_not_supported_by_ontology_and_wrong_then_consider_them_text():
    """An unknown tag with an unknown class is expected to become a
    ``<span class="UncategorizedText">`` holding the same text.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
        <newtag class="wrongclass">Some text
        </newtag>
    </div>
    """

    # TODO (Pluto): Maybe it should be considered as plain text?

    # language=HTML
    raw_expected = """
    <div class="Page">
        <span class="UncategorizedText">Some text
        </span>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_div_are_ignored_when_no_attrs():
    """An attribute-less ``<div>`` wrapper is expected to vanish from the output.

    The radio input it contained should appear directly under the page.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
      <div>
           <input class="RadioButton" name="health-comparison" type="radio"/>
      </div>
    </div>
    """
    # language=HTML
    raw_expected = """
    <div class="Page">
       <input class="RadioButton" name="health-comparison" type="radio"/>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_ids_are_preserved():
    """An ``id`` given in the input markup is kept on the parsed element.

    The input places a ``<div>`` carrying ``style`` and ``id`` attributes
    under the page. The test looks that div up as the first child of the
    page and checks that ``additional_attributes["id"]`` equals the value
    from the markup.

    The id is asserted on the parsed object rather than on serialized HTML,
    so no expected-HTML fixture is needed here (a previously built but
    never-asserted ``expected_html`` local was removed).
    """
    # language=HTML
    base_html = _wrap_with_body(
        """
    <div class="Page">
      <div style="background-color: lightblue" id="important_div">
           <input class="RadioButton" name="health-comparison" type="radio"/>
      </div>
    </div>
    """
    )
    base_html = indent_html(base_html)

    ontology: OntologyElement = parse_html_to_ontology(base_html)

    # Walk root -> page -> div; the div is the element that carried the id.
    page = ontology.children[0]
    div_obj = page.children[0]
    assert div_obj.additional_attributes["id"] == "important_div"


def test_br_is_not_considered_uncategorized_text():
    """A lone ``<br/>`` inside a page is expected to round-trip unchanged."""
    # language=HTML
    raw_input = """
    <div class="Page">
        <br/>
    </div>
    """
    # language=HTML
    raw_expected = """
    <div class="Page">
        <br/>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_text_without_tag_is_marked_as_uncategorized_text_when_there_are_other_elements():
    """Bare text next to a sibling element is expected to be wrapped in ``<p>``.

    NOTE(review): despite the test name, the expected markup asserts
    ``class="Paragraph"`` wrappers, not ``UncategorizedText``.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
        About the same
        <input class="RadioButton" name="health-comparison" type="radio"/>
        Some text
    </div>
    """
    # language=HTML
    raw_expected = """
    <div class="Page">
        <p class="Paragraph">
            About the same
        </p>
        <input class="RadioButton" name="health-comparison" type="radio"/>
        <p class="Paragraph">
            Some text
        </p>
    </div>
    """
    # Unlike the sibling tests, the input is fed to the parser without
    # passing through indent_html first.
    source_html = _wrap_with_body(raw_input)
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_keyword_only_attributes_are_preserved_during_mapping():
    """The valueless ``checked`` attribute on a ``FormFieldValue`` input is
    expected to round-trip unchanged.
    """
    # language=HTML
    raw_input = """
    <input class="FormFieldValue" type="radio" name="options" value="2" checked>
    """  # noqa: E501
    # language=HTML
    raw_expected = """
    <input class="FormFieldValue" type="radio" name="options" value="2" checked>
    """  # noqa: E501
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mapping():
    """A class-less radio ``<input ... checked>`` keeps its valueless attribute.

    The expected markup shows the input emitted with ``class="Checkbox"`` and
    the ``checked`` attribute retained, inside an otherwise unchanged form.

    NOTE(review): the earlier comments on this test said an ambiguous
    ``<input>`` is assigned to UncategorizedText, but the expected markup
    asserts ``Checkbox`` - confirm which behaviour is intended.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
        <form class="Form">
            <label class="FormField" for="option1">
                <input type="radio" name="option1" value="2" checked>
                <span class="UncategorizedText">
                    Option 1 (Checked)
                </span>
            </label>
        </form>
    </div>
    """

    # TODO(Pluto): Maybe tag also should be overwritten? Or just leave it as it is?

    # language=HTML
    raw_expected = """
    <div class="Page">
        <form class="Form">
            <label class="FormField" for="option1">
                <input class="Checkbox" type="radio" name="option1" value="2" checked />
                <span class="UncategorizedText">
                    Option 1 (Checked)
                </span>
            </label>
        </form>
    </div>
    """  # noqa: E501
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_broken_cell_is_not_raising_error():
    """A ``<td>`` with a corrupted ``class`` attribute must still parse.

    The first cell's class value contains escaped markup and a stray
    attribute; the test passes when parsing completes and the serialized
    table matches the expected markup below.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
        <table class="Table">
            <tbody class="TableBody">
                <tr class="TableRow">
                   <td class="TableCell&gt;11,442,231&lt;/td&gt;&lt;td class=" tablecell"="">
                    83.64 GiB
                   </td>
                  <th class="TableCellHeader" rowspan="2">
                    Fair Value
                 </th>
               </tr>
           </tbody>
        </table>
    </div>
    """
    # language=HTML
    raw_expected = """
    <div class="Page">
        <table class="Table">
            <tbody>
                <tr>
                   <td tablecell&quot;="">
                    83.64 GiB
                   </td>
                  <th rowspan="2">
                    Fair Value
                 </th>
               </tr>
           </tbody>
        </table>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_table():
    """A simple table is expected to keep its structure and ``rowspan``.

    In the expected markup only ``<table>`` retains a ``class`` attribute;
    ``tbody``, ``tr``, ``td`` and ``th`` are emitted without one.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
        <table class="Table">
            <tbody class="TableBody">
                <tr class="TableRow">
                  <td class="TableCell">
                    Fair Value1
                  </td>
                  <th class="TableCellHeader" rowspan="2">
                    Fair Value2
                 </th>
               </tr>
           </tbody>
        </table>
    </div>
    """
    # language=HTML
    raw_expected = """
    <div class="Page">
        <table class="Table">
            <tbody>
                <tr>
                  <td>
                    Fair Value1
                  </td>
                  <th rowspan="2">
                    Fair Value2
                 </th>
               </tr>
           </tbody>
        </table>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_table_and_time():
    """A table with header, ``<time>`` and ``<span>`` cells keeps its structure.

    In the expected markup ``colspan`` values survive, while every element
    inside the table (including the nested ``<time>`` and ``<span>``) is
    emitted without a ``class`` attribute.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
        <table class="Table">
            <thead class='TableHeader'>
                <tr class="TableRow">
                    <th class="TableCellHeader"  colspan="6">
                        Carrying Value
                    </th>
                </tr>
            </thead>
            <tbody class='TableBody'>
                <tr class="TableRow">
                    <td class="TableCell" colspan="5">
                        <time class="CalendarDate">
                            June 30, 2023
                        </time>
                    </td>
                <td class="TableCell">
                    <span class="Currency">
                        $—
                    </span>
                </td>
                </tr>
            </tbody>
        </table>
    </div>
    """
    # language=HTML
    raw_expected = """
    <div class="Page">
        <table class="Table">
            <thead>
                <tr>
                    <th colspan="6">
                        Carrying Value
                    </th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td colspan="5">
                        <time>
                            June 30, 2023
                        </time>
                    </td>
                <td>
                    <span>
                        $—
                    </span>
                </td>
                </tr>
            </tbody>
        </table>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_malformed_html():
    """Heavily malformed HTML is normalised with html5lib first, then parsed.

    The input has a broken doctype, an unclosed comment, an empty ``class=``,
    mis-nested ``<p>``/``<span>`` tags and no closing ``</body>``. The test
    passes when the serialized ontology equals the expected markup below, in
    which everything sits inside a single ``Paragraph`` ``<p>``.
    """
    # language=HTML
    input_html = """
    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
    <html>
      <head>
        <title>Super Malformed HTML</title>
      </head>
      <body class="Document">
        <!-- Unclosed comment
        <div class=>
          <p>Paragraph with missing closing angle bracket
        <div>
          <span>
            <p>Improperly nested paragraph within a span</span>
          </p>
        </div>
        <script>
          var x = "Unclosed script tag example;
        </script>
        <p>Paragraph with invalid characters: <EFBFBD> <EFBFBD> <EFBFBD></p>
      </div>
    </html>
    """
    # NOTE(review): "<EFBFBD>" looks like the UTF-8 bytes of U+FFFD rendered
    # as text - confirm these literals match the intended invalid characters.

    # HTML this malformed would not come out of html_partitioning, because it
    # uses the html5lib parser. To imitate that behaviour, the input is first
    # run through the same parser.

    input_html = indent_html(input_html, html_parser="html5lib")

    # Ontology has 1 element and everything inside is just Text
    # language=HTML
    expected_html = """
    <body class="Document">

      <p class="Paragraph">
     Unclosed comment
     <div class="">
      <p>
       Paragraph with missing closing angle bracket
       <div>
        <span>
         <p>
          Improperly nested paragraph within a span
         </p>
        </span>
       </div>
      </p>
     </div>
     <script>
      var x = "Unclosed script tag example;
     </script>
     <p>
      Paragraph with invalid characters: <EFBFBD> <EFBFBD> <EFBFBD>
     </p>
     </p>
    </body>
    """

    expected_html = indent_html(expected_html)

    ontology: OntologyElement = parse_html_to_ontology(input_html)
    # Generated ids are stripped so only structure and classes are compared.
    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))

    assert parsed_ontology == expected_html


def test_text_is_wrapped_inside_layout_element():
    """Bare text directly inside a ``Page`` div is expected to be wrapped in
    a ``<p class='Paragraph'>``.
    """
    # language=HTML
    raw_input = """
    <div class="Page">
        Text
    </div>
    """
    # language=HTML
    raw_expected = """
    <div class="Page">
        <p class='Paragraph'>Text</p>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))
    wanted_html = indent_html(_wrap_with_body(raw_expected))

    root: OntologyElement = parse_html_to_ontology(source_html)
    # Generated ids are stripped so only structure and classes are compared.
    rendered = remove_all_ids(root.to_html())

    assert indent_html(rendered) == wanted_html


def test_text_in_form_field_value():
    """``to_text()`` of a ``FormFieldValue`` input yields its ``value``.

    The element's own ``text`` is expected to be the empty string.
    """
    # language=HTML
    markup = """
    <div class="Page">
    <input class="FormFieldValue" value="Random Input Value"/>
    </div>
    """
    children = parse_html_to_ontology(markup).children
    assert len(children) == 1

    field = children[0]
    assert field.text == ""
    assert field.to_text() == "Random Input Value"


def test_text_in_form_field_value_with_null_value():
    """An input with ``value=""`` gives an empty ``text`` and ``to_text()``."""
    # language=HTML
    markup = """
    <div class="Page">
    <input class="FormFieldValue" value=""/>
    </div>
    """
    children = parse_html_to_ontology(markup).children
    assert len(children) == 1

    field = children[0]
    assert field.text == ""
    assert field.to_text() == ""


def test_to_text_when_form_field():
    """``Page.to_text(add_children=True)`` yields the value exactly once.

    Both the ``Form`` and its nested ``FormFieldValue`` carry the same
    ``value`` attribute; the expected text is a single occurrence of it.
    """
    field_value = FormFieldValue(
        tag="input",
        additional_attributes={"value": "Random Input Value"},
    )
    form = Form(
        tag="input",
        additional_attributes={"value": "Random Input Value"},
        children=[field_value],
    )
    ontology = Page(children=[form])

    assert ontology.to_text(add_children=True) == "Random Input Value"


def test_recursion_limit_is_limiting_parsing():
    """Parsing 100 nested paragraphs stops descending at ``RECURSION_LIMIT``.

    The depth is counted starting at 1 for the root's first child and
    following first children downward. The deepest node is expected to hold
    the remaining markup as text beginning with ``<p class="Paragraph">``.
    """
    nested_markup = "some text"
    for _ in range(100):
        nested_markup = f"<p class='Paragraph'>{nested_markup}</p>"
    ontology = parse_html_to_ontology(_wrap_with_body(nested_markup))

    depth = 1
    node = ontology.children[0]
    while node.children:
        node = node.children[0]
        depth += 1

    assert node.text.startswith('<p class="Paragraph">')
    assert depth == RECURSION_LIMIT


def test_get_text_when_recursion_limit_activated():
    """``to_text()`` on the deepest parsed node still yields the plain text.

    The input nests 100 paragraphs around ``some text``; the deepest node is
    found by following first children until a node has none.
    """
    nested_markup = "some text"
    for _ in range(100):
        nested_markup = f"<p class='Paragraph'>{nested_markup}</p>"
    ontology = parse_html_to_ontology(_wrap_with_body(nested_markup))

    node = ontology.children[0]
    while node.children:
        node = node.children[0]

    assert node.to_text() == "some text"


def test_uncategorizedtest_has_image_and_no_text():
    """An ``UncategorizedText`` div holding only an ``<img>`` parses as ``Image``.

    The first child of the page must be exactly of type ``Image`` and report
    ``css_class_name == "Image"``.
    """
    # language=HTML
    raw_input = """
        <div class="Page">
    <div class="UncategorizedText">
        <img src="https://www.example.com/image.jpg"/>
    </div>
    </div>
    """
    source_html = indent_html(_wrap_with_body(raw_input))

    root: OntologyElement = parse_html_to_ontology(source_html)

    # root -> page -> first element under the page
    page = root.children[0]
    wrapped = page.children[0]
    assert type(wrapped) is Image
    assert wrapped.css_class_name == "Image"
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML to unstructured elements and back. How does it
work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able to
rebuild the tree, all elements have IDs, and metadata.parent_id is
leveraged.

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								from bs4 import BeautifulSoup
-												Image within div or span with no text is annotated as Image (#3962)

Ticket: https://unstructured-ai.atlassian.net/browse/ML-942

The following uncompressed HTML document can be used to test the
transformation using the `partition_html` function from the VLM
partitioner.


[recalibrating-risk-report.pdf.json.html.zip](https://github.com/user-attachments/files/19330528/recalibrating-risk-report.pdf.json.html.zip)
											
										
										
											2025-03-19 21:09:02 -07:00
+								from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML to unstructured elements and back. How does it
work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able to
rebuild the tree, all elements have IDs, and metadata.parent_id is
leveraged.

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								from unstructured.partition.html.html_utils import indent_html
-												Add max recursion limit and fix to_text() method (#3773)


											
										
										
											2024-11-07 16:08:16 +01:00
+								from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML to unstructured elements and back. How does it
work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able to
rebuild the tree, all elements have IDs, and metadata.parent_id is
leveraged.

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
 								def _wrap_with_body(html: str) -> str:
 								    return f'<body class="Document">{html}</body>'
 								def remove_all_ids(html_str):
 								    soup = BeautifulSoup(html_str, "html.parser")
 								    for tag in soup.find_all(True):
 								        if tag.has_attr("id"):
 								            del tag["id"]
 								    return str(soup)
 								def test_wrong_html_parser_causes_paragraph_to_be_nested_in_div():
 								    # This test would fail if html5lib parser would be applied on the input HTML.
 								    # It would result in Page: <p></p> <address></address>
 								    #         instead of Page: <p><address></address></p>
 								    # language=HTML
 								    input_html = """
 								    <div class="Page">
 								    <p class="NarrativeText">
 								     <address class="Address">
 								      Mountain View, California
 								     </address>
 								    </p>
 								    </div>
 								    """
 								    page = parse_html_to_ontology(input_html)
 								    assert len(page.children) == 1
 								    narrative_text = page.children[0]
 								    assert len(narrative_text.children) == 1
 								    address = narrative_text.children[0]
 								    assert address.text == "Mountain View, California"
 								def test_when_class_is_missing_it_can_be_inferred_from_type():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <aside>Some text</aside>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
-												Fix layout parsing (#3754)


											
										
										
											2024-10-25 16:42:06 +02:00
+								        <aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML to unstructured elements and back. How does it
work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able to
rebuild the tree, all elements have IDs, and metadata.parent_id is
leveraged.

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								    </div>
 								    """
 								    )
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_when_class_is_wrong_tag_name_is_overwritten():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <p class='Sidebar'>Some text</p>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
-												Fix layout parsing (#3754)


											
										
										
											2024-10-25 16:42:06 +02:00
+								        <aside class='Sidebar'><p class='Paragraph'>Some text</p></aside>
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket add parsing HTML to unstructured element and back. How is it
working?

HTML has a tree structure, Unstructured Elements is a list.
HTML structure is traversed in DFS order, creating Elements and adding
them to list. So the reading order from HTML is preserved. To be able to
compose tree again all elements has IDs, and metadata.parent_id is
leveraged

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								    </div>
 								    """
 								    )
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_when_tag_not_supported_by_ontology_and_wrong_then_consider_them_text():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <newtag class="wrongclass">Some text
 								        </newtag>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    # TODO (Pluto): Maybe it should be considered as plain text?
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <span class="UncategorizedText">Some text
 								        </span>
 								    </div>
 								    """
 								    )
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_div_are_ignored_when_no_attrs():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								      <div>
 								           <input class="RadioButton" name="health-comparison" type="radio"/>
 								      </div>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								       <input class="RadioButton" name="health-comparison" type="radio"/>
 								    </div>
 								    """
 								    )
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_ids_are_preserved():
 								    """An explicit ``id`` on a styled <div> stays available on the parsed element.

 								    Only the parsed attribute is checked; the rendered HTML is not compared
 								    in this test.
 								    """
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								      <div style="background-color: lightblue" id="important_div">
 								           <input class="RadioButton" name="health-comparison" type="radio"/>
 								      </div>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    # Walk document -> page -> div to reach the element that carried the id.
 								    page = ontology.children[0]
 								    div_obj = page.children[0]
 								    assert div_obj.additional_attributes["id"] == "important_div"
 								def test_br_is_not_considered_uncategorized_text():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <br/>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <br/>
 								    </div>
 								    """
 								    )
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_text_without_tag_is_marked_as_uncategorized_text_when_there_are_other_elements():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        About the same
 								        <input class="RadioButton" name="health-comparison" type="radio"/>
 								        Some text
 								    </div>
 								    """
 								    )
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <p class="Paragraph">
 								            About the same
 								        </p>
 								        <input class="RadioButton" name="health-comparison" type="radio"/>
 								        <p class="Paragraph">
 								            Some text
 								        </p>
 								    </div>
 								    """
 								    )
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_keyword_only_attributes_are_preserved_during_mapping():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <input class="FormFieldValue" type="radio" name="options" value="2" checked>
 								    """
 								    )  # noqa: E501
 								    base_html = indent_html(base_html)
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <input class="FormFieldValue" type="radio" name="options" value="2" checked>
 								    """
 								    )  # noqa: E501
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mapping():
 								    # <input> can be assigned to multiple classes so it is not clear what it is
 								    #  thus we assign it to UncategorizedText
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <form class="Form">
 								            <label class="FormField" for="option1">
 								                <input type="radio" name="option1" value="2" checked>
 								                <span class="UncategorizedText">
 								                    Option 1 (Checked)
 								                </span>
 								            </label>
 								        </form>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    # TODO(Pluto): Maybe tag also should be overwritten? Or just leave it as it is?
 								    #  We classify <input> as UncategorizedText but all the text is preserved
 								    #  for UnstructuredElement so it make sense now as well
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <form class="Form">
 								            <label class="FormField" for="option1">
-												Define default HTML to ontology mapping (#3784)


											
										
										
											2024-11-20 14:01:28 +01:00
+								                <input class="Checkbox" type="radio" name="option1" value="2" checked />
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket add parsing HTML to unstructured element and back. How is it
working?

HTML has a tree structure, Unstructured Elements is a list.
HTML structure is traversed in DFS order, creating Elements and adding
them to list. So the reading order from HTML is preserved. To be able to
compose tree again all elements has IDs, and metadata.parent_id is
leveraged

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								                <span class="UncategorizedText">
 								                    Option 1 (Checked)
 								                </span>
 								            </label>
 								        </form>
 								    </div>
 								    """
 								    )  # noqa: E501
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_broken_cell_is_not_raising_error():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <table class="Table">
 								            <tbody class="TableBody">
 								                <tr class="TableRow">
 								                   <td class="TableCell&gt;11,442,231&lt;/td&gt;&lt;td class=" tablecell"="">
 .64 GiB
 								                   </td>
 								                  <th class="TableCellHeader" rowspan="2">
 								                    Fair Value
 								                 </th>
 								               </tr>
 								           </tbody>
 								        </table>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <table class="Table">
-												ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute from <input/> tag will be taken into account and
processed as "text" in ontology
- the tables will now be parsed without any ids and classes - we have
different reasons behind that, for example, embeddings with ids and
classes can lose some semantic value. Also, more tokens = more expensive
LLM call
-  cleaned to_html, created to_text for OntologyElement
											
										
										
											2024-10-31 02:30:53 +01:00
+								            <tbody>
 								                <tr>
 								                   <td tablecell&quot;="">
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket add parsing HTML to unstructured element and back. How is it
working?

HTML has a tree structure, Unstructured Elements is a list.
HTML structure is traversed in DFS order, creating Elements and adding
them to list. So the reading order from HTML is preserved. To be able to
compose tree again all elements has IDs, and metadata.parent_id is
leveraged

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+.64 GiB
 								                   </td>
-												ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute from <input/> tag will be taken into account and
processed as "text" in ontology
- the tables will now be parsed without any ids and classes - we have
different reasons behind that, for example, embeddings with ids and
classes can lose some semantic value. Also, more tokens = more expensive
LLM call
-  cleaned to_html, created to_text for OntologyElement
											
										
										
											2024-10-31 02:30:53 +01:00
+								                  <th rowspan="2">
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket add parsing HTML to unstructured element and back. How is it
working?

HTML has a tree structure, Unstructured Elements is a list.
HTML structure is traversed in DFS order, creating Elements and adding
them to list. So the reading order from HTML is preserved. To be able to
compose tree again all elements has IDs, and metadata.parent_id is
leveraged

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								                    Fair Value
 								                 </th>
 								               </tr>
 								           </tbody>
 								        </table>
 								    </div>
 								    """
 								    )
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_table():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <table class="Table">
 								            <tbody class="TableBody">
 								                <tr class="TableRow">
 								                  <td class="TableCell">
 								                    Fair Value1
 								                  </td>
 								                  <th class="TableCellHeader" rowspan="2">
 								                    Fair Value2
 								                 </th>
 								               </tr>
 								           </tbody>
 								        </table>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <table class="Table">
-												ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute from <input/> tag will be taken into account and
processed as "text" in ontology
- the tables will now be parsed without any ids and classes - we have
different reasons behind that, for example, embeddings with ids and
classes can lose some semantic value. Also, more tokens = more expensive
LLM call
-  cleaned to_html, created to_text for OntologyElement
											
										
										
											2024-10-31 02:30:53 +01:00
+								            <tbody>
 								                <tr>
 								                  <td>
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket add parsing HTML to unstructured element and back. How is it
working?

HTML has a tree structure, Unstructured Elements is a list.
HTML structure is traversed in DFS order, creating Elements and adding
them to list. So the reading order from HTML is preserved. To be able to
compose tree again all elements has IDs, and metadata.parent_id is
leveraged

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								                    Fair Value1
 								                  </td>
-												ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute from <input/> tag will be taken into account and
processed as "text" in ontology
- the tables will now be parsed without any ids and classes - we have
different reasons behind that, for example, embeddings with ids and
classes can lose some semantic value. Also, more tokens = more expensive
LLM call
-  cleaned to_html, created to_text for OntologyElement
											
										
										
											2024-10-31 02:30:53 +01:00
+								                  <th rowspan="2">
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket add parsing HTML to unstructured element and back. How is it
working?

HTML has a tree structure, Unstructured Elements is a list.
HTML structure is traversed in DFS order, creating Elements and adding
them to list. So the reading order from HTML is preserved. To be able to
compose tree again all elements has IDs, and metadata.parent_id is
leveraged

How html is preserved if there are 'layout' without text, or there are
deeply nested HTMLs that are just text from the point of view of
Unstructured Element?
Each element is parsed back to HTML using metadata.text_as_html field.
For layout elements only html_tag are there, for long text elements
there is everything required to recreate HTML - you can see examples in
unit tests or .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								                    Fair Value2
 								                 </th>
 								               </tr>
 								           </tbody>
 								        </table>
 								    </div>
 								    """
 								    )
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_table_and_time():
 								    # language=HTML
 								    base_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <table class="Table">
 								            <thead class='TableHeader'>
 								                <tr class="TableRow">
 								                    <th class="TableCellHeader"  colspan="6">
 								                        Carrying Value
 								                    </th>
 								                </tr>
 								            </thead>
 								            <tbody class='TableBody'>
 								                <tr class="TableRow">
 								                    <td class="TableCell" colspan="5">
 								                        <time class="CalendarDate">
 								                            June 30, 2023
 								                        </time>
 								                    </td>
 								                <td class="TableCell">
 								                    <span class="Currency">
 								                        $—
 								                    </span>
 								                </td>
 								                </tr>
 								            </tbody>
 								        </table>
 								    </div>
 								    """
 								    )
 								    base_html = indent_html(base_html)
 								    # language=HTML
 								    expected_html = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <table class="Table">
-												ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute from <input/> tag will be taken into account and
processed as "text" in ontology
- tables will now be parsed without any ids or classes. There are several
reasons for that: for example, embeddings of HTML with ids and classes can
lose some semantic value, and more tokens mean a more expensive
LLM call
-  cleaned to_html, created to_text for OntologyElement
											
										
										
											2024-10-31 02:30:53 +01:00
+								            <thead>
 								                <tr>
 								                    <th colspan="6">
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML into unstructured elements and back. How does
it work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able
to compose the tree again, all elements have IDs, and metadata.parent_id is
leveraged.

How is the HTML preserved when there are 'layout' elements without text, or
deeply nested HTML fragments that are just text from the point of view of
an Unstructured Element?
Each element is parsed back to HTML using the metadata.text_as_html field.
For layout elements only the HTML tag is stored there; for long text elements
it holds everything required to recreate the HTML - you can see examples in
the unit tests or the .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								                        Carrying Value
 								                    </th>
 								                </tr>
 								            </thead>
-												ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute from <input/> tag will be taken into account and
processed as "text" in ontology
- tables will now be parsed without any ids or classes. There are several
reasons for that: for example, embeddings of HTML with ids and classes can
lose some semantic value, and more tokens mean a more expensive
LLM call
-  cleaned to_html, created to_text for OntologyElement
											
										
										
											2024-10-31 02:30:53 +01:00
+								            <tbody>
 								                <tr>
 								                    <td colspan="5">
-												Fix parsing table cells (#3904)

This PR:
- Fixes the unintended removal of HTML tags that exist inside <td> cells
- The stripping function was generally hard to implement in an easy and
straightforward way (you can't modify `descendants` in-place). So instead
of patching only the table cell handling, I added stripping everywhere in
the same consistent way. This is why some tests needed small edits that
remove one whitespace in each tag. I believe this won't cause any problems
for downstream tasks.

Tested HTML:
```html
<table class="Table">
    <tbody>
        <tr>
            <td colspan="2">
                Some text                                        
            </td>
            <td>
                <input checked="" class="Checkbox" type="checkbox"/>
            </td>
        </tr>
    </tbody>
</table>
```
Before & After
```html
'<table class="Table" id="..."> <tbody> <tr> <td colspan="2">Some text</td><td></td></tr></tbody></table>'
'<table class="Table" id="..."><tbody><tr><td colspan="2">Some text</td><td><input checked="" type="checkbox"/></td></tr></tbody></table>'
```
											
										
										
											2025-02-05 16:28:49 +01:00
+								                        <time>
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML into unstructured elements and back. How does
it work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able
to compose the tree again, all elements have IDs, and metadata.parent_id is
leveraged.

How is the HTML preserved when there are 'layout' elements without text, or
deeply nested HTML fragments that are just text from the point of view of
an Unstructured Element?
Each element is parsed back to HTML using the metadata.text_as_html field.
For layout elements only the HTML tag is stored there; for long text elements
it holds everything required to recreate the HTML - you can see examples in
the unit tests or the .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								                            June 30, 2023
-												Fix parsing table cells (#3904)

This PR:
- Fixes the unintended removal of HTML tags that exist inside <td> cells
- The stripping function was generally hard to implement in an easy and
straightforward way (you can't modify `descendants` in-place). So instead
of patching only the table cell handling, I added stripping everywhere in
the same consistent way. This is why some tests needed small edits that
remove one whitespace in each tag. I believe this won't cause any problems
for downstream tasks.

Tested HTML:
```html
<table class="Table">
    <tbody>
        <tr>
            <td colspan="2">
                Some text                                        
            </td>
            <td>
                <input checked="" class="Checkbox" type="checkbox"/>
            </td>
        </tr>
    </tbody>
</table>
```
Before & After
```html
'<table class="Table" id="..."> <tbody> <tr> <td colspan="2">Some text</td><td></td></tr></tbody></table>'
'<table class="Table" id="..."><tbody><tr><td colspan="2">Some text</td><td><input checked="" type="checkbox"/></td></tr></tbody></table>'
```
											
										
										
											2025-02-05 16:28:49 +01:00
+								                        </time>
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML into unstructured elements and back. How does
it work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able
to compose the tree again, all elements have IDs, and metadata.parent_id is
leveraged.

How is the HTML preserved when there are 'layout' elements without text, or
deeply nested HTML fragments that are just text from the point of view of
an Unstructured Element?
Each element is parsed back to HTML using the metadata.text_as_html field.
For layout elements only the HTML tag is stored there; for long text elements
it holds everything required to recreate the HTML - you can see examples in
the unit tests or the .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								                    </td>
-												ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute from <input/> tag will be taken into account and
processed as "text" in ontology
- tables will now be parsed without any ids or classes. There are several
reasons for that: for example, embeddings of HTML with ids and classes can
lose some semantic value, and more tokens mean a more expensive
LLM call
-  cleaned to_html, created to_text for OntologyElement
											
										
										
											2024-10-31 02:30:53 +01:00
+								                <td>
-												Fix parsing table cells (#3904)

This PR:
- Fixes the unintended removal of HTML tags that exist inside <td> cells
- The stripping function was generally hard to implement in an easy and
straightforward way (you can't modify `descendants` in-place). So instead
of patching only the table cell handling, I added stripping everywhere in
the same consistent way. This is why some tests needed small edits that
remove one whitespace in each tag. I believe this won't cause any problems
for downstream tasks.

Tested HTML:
```html
<table class="Table">
    <tbody>
        <tr>
            <td colspan="2">
                Some text                                        
            </td>
            <td>
                <input checked="" class="Checkbox" type="checkbox"/>
            </td>
        </tr>
    </tbody>
</table>
```
Before & After
```html
'<table class="Table" id="..."> <tbody> <tr> <td colspan="2">Some text</td><td></td></tr></tbody></table>'
'<table class="Table" id="..."><tbody><tr><td colspan="2">Some text</td><td><input checked="" type="checkbox"/></td></tr></tbody></table>'
```
											
										
										
											2025-02-05 16:28:49 +01:00
+								                    <span>
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML into unstructured elements and back. How does
it work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able
to compose the tree again, all elements have IDs, and metadata.parent_id is
leveraged.

How is the HTML preserved when there are 'layout' elements without text, or
deeply nested HTML fragments that are just text from the point of view of
an Unstructured Element?
Each element is parsed back to HTML using the metadata.text_as_html field.
For layout elements only the HTML tag is stored there; for long text elements
it holds everything required to recreate the HTML - you can see examples in
the unit tests or the .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								                        $—
-												Fix parsing table cells (#3904)

This PR:
- Fixes the unintended removal of HTML tags that exist inside <td> cells
- The stripping function was generally hard to implement in an easy and
straightforward way (you can't modify `descendants` in-place). So instead
of patching only the table cell handling, I added stripping everywhere in
the same consistent way. This is why some tests needed small edits that
remove one whitespace in each tag. I believe this won't cause any problems
for downstream tasks.

Tested HTML:
```html
<table class="Table">
    <tbody>
        <tr>
            <td colspan="2">
                Some text                                        
            </td>
            <td>
                <input checked="" class="Checkbox" type="checkbox"/>
            </td>
        </tr>
    </tbody>
</table>
```
Before & After
```html
'<table class="Table" id="..."> <tbody> <tr> <td colspan="2">Some text</td><td></td></tr></tbody></table>'
'<table class="Table" id="..."><tbody><tr><td colspan="2">Some text</td><td><input checked="" type="checkbox"/></td></tr></tbody></table>'
```
											
										
										
											2025-02-05 16:28:49 +01:00
+								                    </span>
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML into unstructured elements and back. How does
it work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able
to compose the tree again, all elements have IDs, and metadata.parent_id is
leveraged.

How is the HTML preserved when there are 'layout' elements without text, or
deeply nested HTML fragments that are just text from the point of view of
an Unstructured Element?
Each element is parsed back to HTML using the metadata.text_as_html field.
For layout elements only the HTML tag is stored there; for long text elements
it holds everything required to recreate the HTML - you can see examples in
the unit tests or the .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								                </td>
 								                </tr>
 								            </tbody>
 								        </table>
 								    </div>
 								    """
 								    )
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(base_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
 								def test_malformed_html():
 								    # language=HTML
 								    input_html = """
 								    <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
 								    <html>
 								      <head>
 								        <title>Super Malformed HTML</title>
 								      </head>
 								      <body class="Document">
 								        <!-- Unclosed comment
 								        <div class=>
 								          <p>Paragraph with missing closing angle bracket
 								        <div>
 								          <span>
 								            <p>Improperly nested paragraph within a span</span>
 								          </p>
 								        </div>
 								        <script>
 								          var x = "Unclosed script tag example;
 								        </script>
 								        <p>Paragraph with invalid characters: <EFBFBD> <EFBFBD> <EFBFBD></p>
 								      </div>
 								    </html>
 								    """
 								    # Such malformed HTML won't be returned by html_partitioning as it uses html5lib parser
 								    # to imitate the same behaviour it will be first parsed the same way
 								    input_html = indent_html(input_html, html_parser="html5lib")
 								    # Ontology has 1 element and everything inside is just Text
 								    # language=HTML
 								    expected_html = """
 								    <body class="Document">
-												Fix layout parsing (#3754)


											
										
										
											2024-10-25 16:42:06 +02:00
 								      <p class="Paragraph">
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML into unstructured elements and back. How does
it work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able
to compose the tree again, all elements have IDs, and metadata.parent_id is
leveraged.

How is the HTML preserved when there are 'layout' elements without text, or
deeply nested HTML fragments that are just text from the point of view of
an Unstructured Element?
Each element is parsed back to HTML using the metadata.text_as_html field.
For layout elements only the HTML tag is stored there; for long text elements
it holds everything required to recreate the HTML - you can see examples in
the unit tests or the .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached example doc before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								     Unclosed comment
 								     <div class="">
 								      <p>
 								       Paragraph with missing closing angle bracket
 								       <div>
 								        <span>
 								         <p>
 								          Improperly nested paragraph within a span
 								         </p>
 								        </span>
 								       </div>
 								      </p>
 								     </div>
 								     <script>
 								      var x = "Unclosed script tag example;
 								     </script>
 								     <p>
 								      Paragraph with invalid characters: <EFBFBD> <EFBFBD> <EFBFBD>
 								     </p>
-												Fix layout parsing (#3754)


											
										
										
											2024-10-25 16:42:06 +02:00
+								     </p>
-												Add parsing HTML to unstructured elements (#3732)

> This is POC change; not everything is working correctly and code
quality could be improved significantly

This ticket adds parsing of HTML into unstructured elements and back. How does
it work?

HTML has a tree structure, whereas Unstructured Elements form a flat list.
The HTML structure is traversed in DFS order, creating Elements and adding
them to the list, so the reading order from the HTML is preserved. To be able
to compose the tree again, all elements have IDs, and metadata.parent_id is
leveraged.

How is the HTML preserved when there are 'layout' elements without text, or
deeply nested HTML fragments that are just text from the point of view of
an Unstructured Element?
Each element is parsed back to HTML using the metadata.text_as_html field.
For layout elements only the HTML tag is stored there; for long text elements
it holds everything required to recreate the HTML - you can see examples in
the unit tests or the .json file I attached.

Pros of solution:
- Nothing had to be changed in element types 

Cons:
- There are elements without Text which may be confusing (they could be
replaced by some special type)

Core transformation logic can be found in 2 functions in
`unstructured/documents/transformations.py`

Known bugs (they are minor):
- sometimes html tag is changed incorrectly
- metadata.category_depth and metadata.page_number are not set
- page break is not added between pages 

How to test. Generate HTML:
```python3
from pathlib import Path

from vlm_partitioner.src.partition import partition

if __name__ == "__main__":
    doc_dir = Path("out_dir")
    file_path = Path("example_doc.pdf")
    partition(str(file_path), provider="anthropic", output_dir=str(doc_dir))
```

Then parse to unstructured elements and back to html
```python3
from pathlib import Path

from unstructured.documents.html_utils import indent_html
from unstructured.documents.transformations import parse_html_to_ontology, ontology_to_unstructured_elements, \
    unstructured_elements_to_ontology
from unstructured.staging.base import elements_to_json

if __name__ == "__main__":
    output_dir = Path("out_dir/")
    output_dir.mkdir(exist_ok=True, parents=True)

    doc_path = Path("out_dir/example_doc.html")

    html_content = doc_path.read_text()

    ontology = parse_html_to_ontology(html_content)
    unstructured_elements = ontology_to_unstructured_elements(ontology)

    elements_to_json(unstructured_elements, str(output_dir / f"{doc_path.stem}_unstr.json"))

    parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
    html_to_save = indent_html(parsed_ontology.to_html())

    Path(output_dir / f"{doc_path.stem}_parsed_unstr.html").write_text(html_to_save)
```

I attached an example doc from before and after running these scripts

[outputs.zip](https://github.com/user-attachments/files/17438673/outputs.zip)
											
										
										
											2024-10-23 14:28:07 +02:00
+								    </body>
 								    """
 								    expected_html = indent_html(expected_html)
 								    ontology: OntologyElement = parse_html_to_ontology(input_html)
 								    parsed_ontology = indent_html(remove_all_ids(ontology.to_html()))
 								    assert parsed_ontology == expected_html
-												Fix layout parsing (#3754)


											
										
										
											2024-10-25 16:42:06 +02:00
 								def test_text_is_wrapped_inside_layout_element():
 								    """Bare text directly inside a Page div is emitted wrapped in a Paragraph <p>.
 								    The input is parsed to an ontology, rendered back to HTML with ids removed,
 								    and compared against the expected markup.
 								    """
 								    # language=HTML
 								    raw_input = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        Text
 								    </div>
 								    """
 								    )
 								    # language=HTML
 								    raw_expected = _wrap_with_body(
 								        """
 								    <div class="Page">
 								        <p class='Paragraph'>Text</p>
 								    </div>
 								    """
 								    )
 								    # Both sides go through indent_html before they are compared.
 								    expected = indent_html(raw_expected)
 								    root: OntologyElement = parse_html_to_ontology(indent_html(raw_input))
 								    actual = indent_html(remove_all_ids(root.to_html()))
 								    assert actual == expected
-												ML-405/ML-427 - OntologyElement improvements (#3758)

- the "value" attribute of the <input/> tag is now taken into account and
processed as "text" in the ontology
- tables are now parsed without any ids and classes - there are several
reasons for that; for example, embeddings with ids and
classes can lose some semantic value. Also, more tokens = a more expensive
LLM call
- cleaned up to_html, created to_text for OntologyElement
											
										
										
											2024-10-31 02:30:53 +01:00
 								def test_text_in_form_field_value():
 								    """An <input> FormFieldValue keeps `.text` empty and exposes `value` via `to_text()`.
 								    The page must contain exactly one child: the parsed input element.
 								    """
 								    # language=HTML
 								    markup = """
 								    <div class="Page">
 								    <input class="FormFieldValue" value="Random Input Value"/>
 								    </div>
 								    """
 								    children = parse_html_to_ontology(markup).children
 								    assert len(children) == 1
 								    field = children[0]
 								    assert field.text == ""
 								    assert field.to_text() == "Random Input Value"
-												Add max recursion limit and fix to_text() method (#3773)


											
										
										
											2024-11-07 16:08:16 +01:00
-												Fix extracting value from field (#3774)


											
										
										
											2024-11-07 19:21:39 +01:00
+								def test_text_in_form_field_value_with_null_value():
 								    # language=HTML
 								    input_html = """
 								    <div class="Page">
 								    <input class="FormFieldValue" value=""/>
 								    </div>
 								    """
 								    page = parse_html_to_ontology(input_html)
 								    assert len(page.children) == 1
 								    form_field_value = page.children[0]
 								    assert form_field_value.text == ""
 								    assert form_field_value.to_text() == ""
-												Add max recursion limit and fix to_text() method (#3773)


											
										
										
											2024-11-07 16:08:16 +01:00
+								def test_to_text_when_form_field():
 								    ontology = Page(
 								        children=[
 								            Form(
 								                tag="input",
 								                additional_attributes={"value": "Random Input Value"},
 								                children=[
 								                    FormFieldValue(
 								                        tag="input",
 								                        additional_attributes={"value": "Random Input Value"},
 								                    )
 								                ],
 								            )
 								        ]
 								    )
 								    assert ontology.to_text(add_children=True) == "Random Input Value"
 								def test_recursion_limit_is_limiting_parsing():
 								    """Parsing 100 nested paragraphs yields a chain of RECURSION_LIMIT levels.
 								    The chain is walked through each node's first child; the deepest node's
 								    text must still start with a raw `<p class="Paragraph">` tag.
 								    """
 								    nesting = 100
 								    # language=HTML
 								    nested_html = "<p class='Paragraph'>" * nesting + "some text" + "</p>" * nesting
 								    ontology = parse_html_to_ontology(_wrap_with_body(nested_html))
 								    depth = 1
 								    node = ontology.children[0]
 								    # Descend through first children until a node without children is reached.
 								    while node.children:
 								        node = node.children[0]
 								        depth += 1
 								    assert node.text.startswith('<p class="Paragraph">')
 								    assert depth == RECURSION_LIMIT
 								def test_get_text_when_recursion_limit_activated():
 								    """`to_text()` on the deepest node of 100 nested paragraphs returns "some text".
 								    The deepest node is found by following each node's first child until a
 								    node without children is reached.
 								    """
 								    nesting = 100
 								    nested_html = "<p class='Paragraph'>" * nesting + "some text" + "</p>" * nesting
 								    ontology = parse_html_to_ontology(_wrap_with_body(nested_html))
 								    node = ontology.children[0]
 								    while node.children:
 								        node = node.children[0]
 								    assert node.to_text() == "some text"
-												Image within div or span with no text is annotated as Image (#3962)

Ticket: https://unstructured-ai.atlassian.net/browse/ML-942

The following uncompressed HTML document can be used to test the
transformation using the `partition_html` function from the VLM
partitioner.


[recalibrating-risk-report.pdf.json.html.zip](https://github.com/user-attachments/files/19330528/recalibrating-risk-report.pdf.json.html.zip)
											
										
										
											2025-03-19 21:09:02 -07:00
 								def test_uncategorizedtest_has_image_and_no_text():
 								    """An UncategorizedText div that holds only an <img> is parsed as an Image.
 								    The element at ontology.children[0].children[0] must be exactly of type
 								    Image and report the "Image" CSS class name.
 								    """
 								    # language=HTML
 								    markup = _wrap_with_body(
 								        """
 								        <div class="Page">
 								    <div class="UncategorizedText">
 								        <img src="https://www.example.com/image.jpg"/>
 								    </div>
 								    </div>
 								    """
 								    )
 								    root: OntologyElement = parse_html_to_ontology(indent_html(markup))
 								    outer = root.children[0]
 								    candidate = outer.children[0]
 								    assert type(candidate) is Image
 								    assert candidate.css_class_name == "Image"