Yuming Long 542d442699
chore CORE-4775: remove html page number metadata field (#2942)
### Summary

Remove the `page_number` metadata field until we have page counting for all
kinds of HTML files (not just news articles with multiple `<article>`
tags).

### Test
Unit tests
`test_add_chunking_strategy_on_partition_html_respects_multipage` and
`test_add_chunking_strategy_title_on_partition_auto_respects_multipage`
were removed since they rely on the `page_number` fields from the SEC HTML
file. That coverage has moved to a mock test for chunk_by_title; revisit those
tests when we find a suitable test file for this.

Also changed the element ids in the partition outputs for HTML files -
the ids change because the page number is part of the element id hashing.
Todo ticket: update other deterministic element id tests per crag's
comment.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
2024-04-30 15:20:26 +00:00

171 lines
5.5 KiB
JSON

[
{
"element_id": "218722ac66e142a570ab2053b430c6c4",
"metadata": {
"data_source": {
"date_modified": "2010-01-23T23:18:40",
"record_locator": {
"file_path": "test.html",
"repo_path": "dcneiner/Downloadify"
},
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Downloadify Example",
"type": "Title"
},
{
"element_id": "bf0fab1925c4b2cbb23a53afce28ebd2",
"metadata": {
"data_source": {
"date_modified": "2010-01-23T23:18:40",
"record_locator": {
"file_path": "test.html",
"repo_path": "dcneiner/Downloadify"
},
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
},
"filetype": "text/html",
"languages": [
"eng"
],
"link_start_indexes": [
27
],
"link_texts": [
"Github Project Page"
],
"link_urls": [
"http://github.com/dcneiner/Downloadify"
]
},
"text": "More info available at the Github Project Page",
"type": "Title"
},
{
"element_id": "d31b62e7e93ed0f7cdebc476a335b3b7",
"metadata": {
"data_source": {
"date_modified": "2010-01-23T23:18:40",
"record_locator": {
"file_path": "test.html",
"repo_path": "dcneiner/Downloadify"
},
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Filename",
"type": "Title"
},
{
"element_id": "3cc4ede1735d2fe4f5e3f7e5aa29f277",
"metadata": {
"data_source": {
"date_modified": "2010-01-23T23:18:40",
"record_locator": {
"file_path": "test.html",
"repo_path": "dcneiner/Downloadify"
},
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "File Contents",
"type": "Title"
},
{
"element_id": "4064323aa25900cf4cb136f665a2aa06",
"metadata": {
"data_source": {
"date_modified": "2010-01-23T23:18:40",
"record_locator": {
"file_path": "test.html",
"repo_path": "dcneiner/Downloadify"
},
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Whatever you put in this text box will be downloaded and saved in the file. If you leave it blank, no file will be downloaded",
"type": "NarrativeText"
},
{
"element_id": "4a6c95c31bb76c1c7f818ca31ea6e0a6",
"metadata": {
"data_source": {
"date_modified": "2010-01-23T23:18:40",
"record_locator": {
"file_path": "test.html",
"repo_path": "dcneiner/Downloadify"
},
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "You must have Flash 10 installed to download this file.",
"type": "NarrativeText"
},
{
"element_id": "6eeead38cfa3d4b7462eb9042aead2c4",
"metadata": {
"data_source": {
"date_modified": "2010-01-23T23:18:40",
"record_locator": {
"file_path": "test.html",
"repo_path": "dcneiner/Downloadify"
},
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Downloadify Invoke Script For This Page",
"type": "Title"
},
{
"element_id": "69aedad22d0d13473663cb65051284a0",
"metadata": {
"data_source": {
"date_modified": "2010-01-23T23:18:40",
"record_locator": {
"file_path": "test.html",
"repo_path": "dcneiner/Downloadify"
},
"url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html",
"version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\""
},
"filetype": "text/html",
"languages": [
"eng"
]
},
"text": "Downloadify.create('downloadify',{\n filename: function(){\n return document.getElementById('filename').value;\n },\n data: function(){ \n return document.getElementById('data').value;\n },\n onComplete: function(){ \n alert('Your File Has Been Saved!'); \n },\n onCancel: function(){ \n alert('You have cancelled the saving of this file.');\n },\n onError: function(){ \n alert('You must put something in the File Contents or there will be nothing to save!'); \n },\n swf: 'media/downloadify.swf',\n downloadImage: 'images/download.png',\n width: 100,\n height: 30,\n transparent: true,\n append: false\n});",
"type": "NarrativeText"
}
]