mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 18:14:51 +00:00 
			
		
		
		
	 542d442699
			
		
	
	
		542d442699
		
			
		
	
	
	
	
		
			
			### Summary Rip out the `page_number` metadata fields until we have page counting for all kinds of HTML files (not just limited to news articles with multiple `<article>` tags). ### Test The unit tests `test_add_chunking_strategy_on_partition_html_respects_multipage` and `test_add_chunking_strategy_title_on_partition_auto_respects_multipage` were removed since they rely on the `page_number` fields from the SEC HTML file - that coverage has now moved to a mock test for chunk_by_title -> revisit those tests when we find a test file for this. Also changed the element ids in the partition outputs for HTML files - the element ids change because the page number is part of the element id hash -> todo ticket: update other deterministic element id tests per crag's comment --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
		
			
				
	
	
		
			23 lines
		
	
	
		
			605 B
		
	
	
	
		
			JSON
		
	
	
	
	
	
			
		
		
	
	
			23 lines
		
	
	
		
			605 B
		
	
	
	
		
			JSON
		
	
	
	
	
	
| [
 | |
|   {
 | |
|     "element_id": "9c477a45b1db458b12a2a350ecb57a36",
 | |
|     "metadata": {
 | |
|       "data_source": {
 | |
|         "date_created": "2023-07-13T14:28:06.310000",
 | |
|         "date_modified": "2023-07-14T22:16:58.907000",
 | |
|         "record_locator": {
 | |
|           "page_id": "2589704",
 | |
|           "url": "https://unstructured-ingest-test.atlassian.net"
 | |
|         },
 | |
|         "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/2589704",
 | |
|         "version": "3"
 | |
|       },
 | |
|       "filetype": "text/html",
 | |
|       "languages": [
 | |
|         "eng"
 | |
|       ]
 | |
|     },
 | |
|     "text": "Test text",
 | |
|     "type": "Title"
 | |
|   }
 | |
| ] |