mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 23:20:35 +00:00

### Summary

Closes #2011

`languages` was missing from the metadata when partitioning PDFs via the `hi_res` and `fast` strategies, and missing from image partitions via `hi_res`. This PR adds `languages` to the relevant function calls so it is included in the resulting elements.

### Testing

On the main branch, `partition_image` will include `languages` when `strategy='ocr_only'`, but not when `strategy='hi_res'`:

```
filename = "example-docs/english-and-korean.png"
from unstructured.partition.image import partition_image
elements = partition_image(filename, strategy="ocr_only", languages=['eng', 'kor'])
elements[0].metadata.languages
elements = partition_image(filename, strategy="hi_res", languages=['eng', 'kor'])
elements[0].metadata.languages
```

For `partition_pdf`, `'ocr_only'` will include `languages` in the metadata, but `'fast'` and `'hi_res'` will not.

```
filename = "example-docs/korean-text-with-tables.pdf"
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(filename, strategy="ocr_only", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="fast", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="hi_res", languages=['kor'])
elements[0].metadata.languages
```

On this branch, `languages` is included in the metadata regardless of strategy.

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
38 lines
1.2 KiB
Python
Executable File
38 lines
1.2 KiB
Python
Executable File
import click
|
|
from deltalake import DeltaTable
|
|
|
|
|
|
@click.command()
@click.option("--table-uri", type=str)
def run_check(table_uri):
    """Check that the Delta table at TABLE_URI has the expected shape.

    Loads the table into a pandas DataFrame, prints the actual versus
    expected row and column counts, and fails when either count differs.

    Args:
        table_uri: URI of the Delta table, supplied via ``--table-uri``.

    Raises:
        AssertionError: If the row count or the column count does not match
            the expected value.
    """
    print(f"Checking contents of table at {table_uri}")
    delta_table = DeltaTable(
        table_uri=table_uri,
    )

    df = delta_table.to_pandas()
    EXPECTED_ROWS = 5
    EXPECTED_COLUMNS = 19

    # Measure once and reuse the values for both the report and the checks.
    number_of_rows = len(df)
    number_of_columns = len(df.columns)
    print(f"Number of rows in table vs expected: {number_of_rows}/{EXPECTED_ROWS}")
    print(f"Number of columns in table vs expected: {number_of_columns}/{EXPECTED_COLUMNS}")

    # Raise explicitly instead of using `assert`: assert statements are
    # stripped when Python runs with -O, which would turn this check into a
    # silent no-op. AssertionError is kept so the failure type is unchanged.
    if number_of_rows != EXPECTED_ROWS:
        raise AssertionError(
            f"number of rows in generated table ({number_of_rows}) "
            f"doesn't match expected value: {EXPECTED_ROWS}"
        )

    # The number of columns is associated with the flattened JSON structure of
    # the partition output. If this changes, it's most likely due to the
    # metadata changing in the output.
    if number_of_columns != EXPECTED_COLUMNS:
        raise AssertionError(
            f"number of columns in generated table ({number_of_columns}) doesn't "
            f"match expected value: {EXPECTED_COLUMNS}"
        )
    print("table check complete")
|
|
|
|
|
|
# Script entry point: when executed directly, hand control to the click
# command, which reads --table-uri from the command line.
if __name__ == "__main__":
    run_check()
|