mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-07 21:00:29 +00:00
### Summary Rip out the `page_number` metadata fields until we have page counting for all kinds of HTML files (not just news articles with multiple `<article>` tags) ### Test Unit tests `test_add_chunking_strategy_on_partition_html_respects_multipage` and `test_add_chunking_strategy_title_on_partition_auto_respects_multipage` were removed since they rely on the `page_number` fields from the SEC HTML file - the test has now moved to a mock test for chunk_by_title -> revisit those tests when we find a test file for this. Also changed the element ids in the partition outputs for HTML files - the element ids change because of the page number change (page number is part of element id hashing) -> todo ticket: update other deterministic element id tests per crag's comment --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
90 lines
2.1 KiB
JSON
90 lines
2.1 KiB
JSON
[
|
||
{
|
||
"element_id": "a59f117741c76dca0bc8f5ee72e2010b",
|
||
"metadata": {
|
||
"data_source": {
|
||
"permissions_data": [
|
||
{
|
||
"mode": 33188
|
||
}
|
||
],
|
||
"url": "example-docs/fake-html-cp1252.html"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"por",
|
||
"cat",
|
||
"eng",
|
||
"vie"
|
||
]
|
||
},
|
||
"text": "My First Heading",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "82eda2671c5ead903683b67b0f8e3f29",
|
||
"metadata": {
|
||
"data_source": {
|
||
"permissions_data": [
|
||
{
|
||
"mode": 33188
|
||
}
|
||
],
|
||
"url": "example-docs/fake-html-cp1252.html"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"por",
|
||
"cat",
|
||
"eng",
|
||
"vie"
|
||
]
|
||
},
|
||
"text": "My first paragraph.",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "9f76e487d5df3f6c4ce8ea2ece61057f",
|
||
"metadata": {
|
||
"data_source": {
|
||
"permissions_data": [
|
||
{
|
||
"mode": 33188
|
||
}
|
||
],
|
||
"url": "example-docs/fake-html-cp1252.html"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"por",
|
||
"cat",
|
||
"eng",
|
||
"vie"
|
||
]
|
||
},
|
||
"text": "Some CP1252-specific characters:",
|
||
"type": "Title"
|
||
},
|
||
{
|
||
"element_id": "a7394a14aa8bf2dae179420d96ac755c",
|
||
"metadata": {
|
||
"data_source": {
|
||
"permissions_data": [
|
||
{
|
||
"mode": 33188
|
||
}
|
||
],
|
||
"url": "example-docs/fake-html-cp1252.html"
|
||
},
|
||
"filetype": "text/html",
|
||
"languages": [
|
||
"por",
|
||
"cat",
|
||
"eng",
|
||
"vie"
|
||
]
|
||
},
|
||
"text": "¡\t¢\t£\t¤\t¥\t¦\t§\t¨\t©\tª\t«\t¬\tSHY\t®\t¯\n°\t±\t²\t³\t´\tµ\t¶\t·\t¸\t¹\tº\t»\t¼\t½\t¾\t¿\nÀ\tÁ\tÂ\tÃ\tÄ\tÅ\tÆ\tÇ\tÈ\tÉ\tÊ\tË\tÌ\tÍ\tÎ\tÏ\nÐ\tÑ\tÒ\tÓ\tÔ\tÕ\tÖ\t×\tØ\tÙ\tÚ\tÛ\tÜ\tÝ\tÞ\tß\nà\tá\tâ\tã\tä\tå\tæ\tç\tè\té\tê\të\tì\tí\tî\tï\nð\tñ\tò\tó\tô\tõ\tö\t÷\tø\tù\tú\tû\tü\tý\tþ\tÿ",
|
||
"type": "NarrativeText"
|
||
}
|
||
] |