mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-28 07:33:36 +00:00
### Summary Rip off page_number metadata fields until we have page counting for all kinds of html files (not just limited to news articles with multiple `<article>` tag) ### Test Unit tests `test_add_chunking_strategy_on_partition_html_respects_multipage` and `test_add_chunking_strategy_title_on_partition_auto_respects_multipage` removed since they relay on the `page_number` fields from the SEC html file - now test moved to mock test for chunk_by_title -> revisit those tests when we find test file for this Also changed the element ids from partition outputs for html files - element id change due to page number change (in element id hashing) -> todo ticket: update other deterministic element id tests per crag's comment --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
171 lines
5.5 KiB
JSON
171 lines
5.5 KiB
JSON
[
|
|
{
|
|
"element_id": "218722ac66e142a570ab2053b430c6c4",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_modified": "2010-01-23T23:18:40",
|
|
"record_locator": {
|
|
"file_path": "test.html",
|
|
"repo_path": "dcneiner/Downloadify"
|
|
},
|
|
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
|
|
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "Downloadify Example",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "bf0fab1925c4b2cbb23a53afce28ebd2",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_modified": "2010-01-23T23:18:40",
|
|
"record_locator": {
|
|
"file_path": "test.html",
|
|
"repo_path": "dcneiner/Downloadify"
|
|
},
|
|
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
|
|
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
],
|
|
"link_start_indexes": [
|
|
27
|
|
],
|
|
"link_texts": [
|
|
"Github Project Page"
|
|
],
|
|
"link_urls": [
|
|
"http://github.com/dcneiner/Downloadify"
|
|
]
|
|
},
|
|
"text": "More info available at the Github Project Page",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "d31b62e7e93ed0f7cdebc476a335b3b7",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_modified": "2010-01-23T23:18:40",
|
|
"record_locator": {
|
|
"file_path": "test.html",
|
|
"repo_path": "dcneiner/Downloadify"
|
|
},
|
|
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
|
|
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "Filename",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "3cc4ede1735d2fe4f5e3f7e5aa29f277",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_modified": "2010-01-23T23:18:40",
|
|
"record_locator": {
|
|
"file_path": "test.html",
|
|
"repo_path": "dcneiner/Downloadify"
|
|
},
|
|
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
|
|
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "File Contents",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "4064323aa25900cf4cb136f665a2aa06",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_modified": "2010-01-23T23:18:40",
|
|
"record_locator": {
|
|
"file_path": "test.html",
|
|
"repo_path": "dcneiner/Downloadify"
|
|
},
|
|
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
|
|
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "Whatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
|
|
"type": "NarrativeText"
|
|
},
|
|
{
|
|
"element_id": "4a6c95c31bb76c1c7f818ca31ea6e0a6",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_modified": "2010-01-23T23:18:40",
|
|
"record_locator": {
|
|
"file_path": "test.html",
|
|
"repo_path": "dcneiner/Downloadify"
|
|
},
|
|
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
|
|
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "You must have Flash 10 installed to download this file.",
|
|
"type": "NarrativeText"
|
|
},
|
|
{
|
|
"element_id": "6eeead38cfa3d4b7462eb9042aead2c4",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_modified": "2010-01-23T23:18:40",
|
|
"record_locator": {
|
|
"file_path": "test.html",
|
|
"repo_path": "dcneiner/Downloadify"
|
|
},
|
|
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
|
|
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "Downloadify Invoke Script For This Page",
|
|
"type": "Title"
|
|
},
|
|
{
|
|
"element_id": "69aedad22d0d13473663cb65051284a0",
|
|
"metadata": {
|
|
"data_source": {
|
|
"date_modified": "2010-01-23T23:18:40",
|
|
"record_locator": {
|
|
"file_path": "test.html",
|
|
"repo_path": "dcneiner/Downloadify"
|
|
},
|
|
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
|
|
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
|
|
},
|
|
"filetype": "text/html",
|
|
"languages": [
|
|
"eng"
|
|
]
|
|
},
|
|
"text": "Downloadify.create('downloadify',{\n filename: function(){\n return document.getElementById('filename').value;\n },\n data: function(){ \n return document.getElementById('data').value;\n },\n onComplete: function(){ \n alert('Your File Has Been Saved!'); \n },\n onCancel: function(){ \n alert('You have cancelled the saving of this file.');\n },\n onError: function(){ \n alert('You must put something in the File Contents or there will be nothing to save!'); \n },\n swf: 'media/downloadify.swf',\n downloadImage: 'images/download.png',\n width: 100,\n height: 30,\n transparent: true,\n append: false\n});",
|
|
"type": "NarrativeText"
|
|
}
|
|
] |