mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	### Summary Remove the `page_number` metadata fields until page counting is supported for all kinds of HTML files (not just news articles with multiple `<article>` tags). ### Test The unit tests `test_add_chunking_strategy_on_partition_html_respects_multipage` and `test_add_chunking_strategy_title_on_partition_auto_respects_multipage` were removed because they rely on the `page_number` fields from the SEC HTML file; that coverage has moved to a mock test for chunk_by_title. Revisit those tests once a suitable test file is found. The element ids in the partition outputs for HTML files also changed, because the page number is part of the element id hash. TODO ticket: update the other deterministic element id tests per crag's comment. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
		
			
				
	
	
		
			23 lines
		
	
	
		
			605 B
		
	
	
	
		
			JSON
		
	
	
	
	
	
			
		
		
	
	
			23 lines
		
	
	
		
			605 B
		
	
	
	
		
			JSON
		
	
	
	
	
	
[
 | 
						|
  {
 | 
						|
    "element_id": "273902edca72a67af1614267e617ea06",
 | 
						|
    "metadata": {
 | 
						|
      "data_source": {
 | 
						|
        "date_created": "2023-07-13T14:27:12.821000",
 | 
						|
        "date_modified": "2023-07-13T14:28:03.779000",
 | 
						|
        "record_locator": {
 | 
						|
          "page_id": "2589690",
 | 
						|
          "url": "https://unstructured-ingest-test.atlassian.net"
 | 
						|
        },
 | 
						|
        "url": "https://unstructured-ingest-test.atlassian.net/wiki/rest/api/content/2589690",
 | 
						|
        "version": "1"
 | 
						|
      },
 | 
						|
      "filetype": "text/html",
 | 
						|
      "languages": [
 | 
						|
        "eng"
 | 
						|
      ]
 | 
						|
    },
 | 
						|
    "text": "Test text",
 | 
						|
    "type": "Title"
 | 
						|
  }
 | 
						|
] |