mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-31 10:03:07 +00:00 
			
		
		
		
	 1ead5a27df
			
		
	
	
		1ead5a27df
		
			
		
	
	
	
	
		
			
### Summary
Closes #2011

`languages` was missing from the metadata when partitioning PDFs via the `hi_res` and `fast` strategies, and missing from image partitions via `hi_res`. This PR adds `languages` to the relevant function calls so it is included in the resulting elements.

### Testing
On the main branch, `partition_image` will include `languages` when `strategy='ocr_only'`, but not when `strategy='hi_res'`:
```
filename = "example-docs/english-and-korean.png"
from unstructured.partition.image import partition_image
elements = partition_image(filename, strategy="ocr_only", languages=['eng', 'kor'])
elements[0].metadata.languages
elements = partition_image(filename, strategy="hi_res", languages=['eng', 'kor'])
elements[0].metadata.languages
```
For `partition_pdf`, `'ocr_only'` will include `languages` in the metadata, but `'fast'` and `'hi_res'` will not.
```
filename = "example-docs/korean-text-with-tables.pdf"
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(filename, strategy="ocr_only", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="fast", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="hi_res", languages=['kor'])
elements[0].metadata.languages
```
On this branch, `languages` is included in the metadata regardless of strategy.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
		
			
				
	
	
		
			38 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			38 lines
		
	
	
		
			1.2 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
| import click
 | |
| from deltalake import DeltaTable
 | |
| 
 | |
| 
 | |
@click.command()
@click.option("--table-uri", type=str)
def run_check(table_uri):
    """Check that the Delta table at TABLE_URI has the expected shape.

    Loads the table into a pandas DataFrame, prints the actual vs. expected
    row and column counts, and raises AssertionError if either count differs
    from its expected value.
    """
    EXPECTED_ROWS = 5
    EXPECTED_COLUMNS = 19

    print(f"Checking contents of table at {table_uri}")
    delta_table = DeltaTable(
        table_uri=table_uri,
    )

    df = delta_table.to_pandas()
    number_of_rows = len(df)
    number_of_columns = len(df.columns)

    # Report both counts before checking either, so a failing run still shows
    # the full picture in its output.
    print(f"Number of rows in table vs expected: {number_of_rows}/{EXPECTED_ROWS}")
    print(f"Number of columns in table vs expected: {number_of_columns}/{EXPECTED_COLUMNS}")

    # Raise explicitly instead of using `assert`: assert statements are removed
    # when Python runs with -O, which would turn this check into a no-op.
    if number_of_rows != EXPECTED_ROWS:
        raise AssertionError(
            f"number of rows in generated table ({number_of_rows}) "
            f"doesn't match expected value: {EXPECTED_ROWS}"
        )

    # The number of columns is associated with the flattened JSON structure of
    # the partition output. If this changes, it's most likely due to the
    # metadata changing in the output.
    if number_of_columns != EXPECTED_COLUMNS:
        raise AssertionError(
            f"number of columns in generated table ({number_of_columns}) doesn't "
            f"match expected value: {EXPECTED_COLUMNS}"
        )

    print("table check complete")
 | |
| 
 | |
| 
 | |
# Script entry point: run_check is a click command, so it is called without
# arguments and click supplies --table-uri from the command line.
if __name__ == "__main__":
    run_check()
 |