mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-04 06:50:02 +00:00

In this pull request parent-child relationship for elements generated with v2 parser is based on actual element IDs instead of IDs baked somewhere in the HTML script. With some extra bug fixing it allowed for significantly simplifying json -> HTML script
162 lines
5.2 KiB
JSON
162 lines
5.2 KiB
JSON
[
|
|
{
|
|
"element_id": "3a6b156a81764e17be128264241f8136",
|
|
"metadata": {
|
|
"category_depth": 0,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "897a8a47377c4ad6aab839a929879537",
|
|
"text_as_html": "<div class=\"Page\" data-page-number=\"1\" />"
|
|
},
|
|
"text": "",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "45b3d0053468484ba1c7b53998115412",
|
|
"metadata": {
|
|
"category_depth": 1,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "3a6b156a81764e17be128264241f8136",
|
|
"text_as_html": "<header class=\"Header\" />"
|
|
},
|
|
"text": "",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "c95473e8a3704fc2b418697f9fddb27b",
|
|
"metadata": {
|
|
"category_depth": 2,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
|
"text_as_html": "<h1 class=\"Title\">Header</h1>"
|
|
},
|
|
"text": "Header",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "379cbfdc16d44bd6a59e6cfabe6438d5",
|
|
"metadata": {
|
|
"category_depth": 2,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "45b3d0053468484ba1c7b53998115412",
|
|
"text_as_html": "<time class=\"CalendarDate\">Date: October 30, 2023</time>"
|
|
},
|
|
"text": "Date: October 30, 2023",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "637c2f6935fb4353a5f73025ce04619d",
|
|
"metadata": {
|
|
"category_depth": 1,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "3a6b156a81764e17be128264241f8136",
|
|
"text_as_html": "<form class=\"Form\"><label class=\"FormField\" for=\"company-name\">From field name</label><input class=\"FormFieldValue\" value=\"Example value\" /></form>"
|
|
},
|
|
"text": "From field name Example value",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "592422373ed741b68a077e2003f8ed81",
|
|
"metadata": {
|
|
"category_depth": 1,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "3a6b156a81764e17be128264241f8136",
|
|
"text_as_html": "<section class=\"Section\" />"
|
|
},
|
|
"text": "",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "dc3792d4422e444f90876b56d0cfb20d",
|
|
"metadata": {
|
|
"category_depth": 2,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "592422373ed741b68a077e2003f8ed81",
|
|
"text_as_html": "<table class=\"Table\"><thead><tr><th>Description</th><th>Row header</th></tr></thead><tbody><tr><td>Value description</td><td><span>50 $</span><span>(1.32 %)</span></td></tr></tbody></table>"
|
|
},
|
|
"text": "Description Row header Value description 50 $ (1.32 %)",
|
|
"type": "Table"
|
|
},
|
|
{
|
|
"element_id": "1032242af75c4b37984ea7fea9aac74c",
|
|
"metadata": {
|
|
"category_depth": 1,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "3a6b156a81764e17be128264241f8136",
|
|
"text_as_html": "<section class=\"Section\" />"
|
|
},
|
|
"text": "",
|
|
"type": "UncategorizedText"
|
|
},
|
|
{
|
|
"element_id": "2a4e2c4a689f4f9a8c180b6b521e45c3",
|
|
"metadata": {
|
|
"category_depth": 2,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
|
|
"text_as_html": "<h2 class=\"Subtitle\">2. Subtitle</h2>"
|
|
},
|
|
"text": "2. Subtitle",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "5591f7a4df01447e82515ce45f686fbe",
|
|
"metadata": {
|
|
"category_depth": 2,
|
|
"filename": "example.pdf",
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"page_number": 1,
|
|
"parent_id": "1032242af75c4b37984ea7fea9aac74c",
|
|
"text_as_html": "<p class=\"NarrativeText\">Paragraph text</p>"
|
|
},
|
|
"text": "Paragraph text",
|
|
"type": "NarrativeText"
|
|
}
|
|
] |