mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 18:14:51 +00:00 
			
		
		
		
	 ee9be2a3b2
			
		
	
	
		ee9be2a3b2
		
			
		
	
	
	
	
		
			
			Addresses a cluster of HTML-related bugs: - empty table is identified as bulleted-table - `partition_html()` emits empty (no text) tables (#1928) - `.text_as_html` contains inappropriate `<br>` elements in invalid locations. - cells enclosed in `<thead>` and `<tfoot>` elements are dropped (#1928) - `.text_as_html` contains whitespace padding Each of these is addressed in a separate commit below. Fixes #1928. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: scanny <scanny@users.noreply.github.com> Co-authored-by: Yuming Long <63475068+yuming-long@users.noreply.github.com>
		
			
				
	
	
		
			307 lines
		
	
	
		
			8.4 KiB
		
	
	
	
		
			JSON
		
	
	
	
	
	
			
		
		
	
	
			307 lines
		
	
	
		
			8.4 KiB
		
	
	
	
		
			JSON
		
	
	
	
	
	
| [
 | |
|   {
 | |
|     "element_id": "307afee17dac4c598e361c095338decd",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "Copy and paste this section for each week.",
 | |
|     "type": "Title"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "b980a145c5e8c9e233a0643366ba520a",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "emphasized_text_contents": [
 | |
|         "Win"
 | |
|       ],
 | |
|       "emphasized_text_tags": [
 | |
|         "strong"
 | |
|       ],
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "Win",
 | |
|     "type": "Title"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "",
 | |
|     "type": "ListItem"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "",
 | |
|     "type": "ListItem"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "",
 | |
|     "type": "ListItem"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "aecc044c7725a6555114285dc28fe2d1",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "emphasized_text_contents": [
 | |
|         "Needs input"
 | |
|       ],
 | |
|       "emphasized_text_tags": [
 | |
|         "strong"
 | |
|       ],
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "Needs input",
 | |
|     "type": "Title"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "",
 | |
|     "type": "ListItem"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "",
 | |
|     "type": "ListItem"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "",
 | |
|     "type": "ListItem"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "9d3cab2b5efed4eaef42a707dbc813da",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "emphasized_text_contents": [
 | |
|         "Focus"
 | |
|       ],
 | |
|       "emphasized_text_tags": [
 | |
|         "strong"
 | |
|       ],
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "Focus",
 | |
|     "type": "Title"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "",
 | |
|     "type": "ListItem"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "e3b0c44298fc1c149afbf4c8996fb924",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1
 | |
|     },
 | |
|     "text": "",
 | |
|     "type": "ListItem"
 | |
|   },
 | |
|   {
 | |
|     "element_id": "a240e43c0ae70731c65ae5430d2dab7f",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-09T12:54:45.226000",
 | |
|         "date_modified": "2023-07-09T12:54:45.226000",
 | |
|         "record_locator": {
 | |
|           "page_id": "1605942",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/1605942",
 | |
|         "version": "1"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ],
 | |
|       "page_number": 1,
 | |
|       "text_as_html": "<table><tr><td>Notes</td></tr><tr><td>Important Links</td></tr></table>"
 | |
|     },
 | |
|     "text": "Notes Important Links",
 | |
|     "type": "Table"
 | |
|   }
 | |
| ] |