2023-09-21 14:51:08 -04:00
|
|
|
import click
|
2023-09-15 18:13:39 -04:00
|
|
|
from deltalake import DeltaTable
|
|
|
|
|
|
|
|
|
2023-09-21 14:51:08 -04:00
|
|
|
@click.command()
|
|
|
|
@click.option("--table-uri", type=str)
|
|
|
|
def run_check(table_uri):
|
|
|
|
print(f"Checking contents of table at {table_uri}")
|
2023-09-15 18:13:39 -04:00
|
|
|
delta_table = DeltaTable(
|
2023-09-21 14:51:08 -04:00
|
|
|
table_uri=table_uri,
|
2023-09-15 18:13:39 -04:00
|
|
|
)
|
|
|
|
|
2023-11-03 12:47:21 -04:00
|
|
|
df = delta_table.to_pandas()
|
Jj/2011 missing languages metadata (#2037)
### Summary
Closes #2011
`languages` was missing from the metadata when partitioning pdfs via
`hi_res` and `fast` strategies and missing from image partitions via
`hi_res`. This PR adds `languages` to the relevant function calls so it
is included in the resulting elements.
### Testing
On the main branch, `partition_image` will include `languages` when
`strategy='ocr_only'`, but not when `strategy='hi_res'`:
```
filename = "example-docs/english-and-korean.png"
from unstructured.partition.image import partition_image
elements = partition_image(filename, strategy="ocr_only", languages=['eng', 'kor'])
elements[0].metadata.languages
elements = partition_image(filename, strategy="hi_res", languages=['eng', 'kor'])
elements[0].metadata.languages
```
For `partition_pdf`, `'ocr_only'` will include `languages` in the
metadata, but `'fast'` and `'hi_res'` will not.
```
filename = "example-docs/korean-text-with-tables.pdf"
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(filename, strategy="ocr_only", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="fast", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="hi_res", languages=['kor'])
elements[0].metadata.languages
```
On this branch, `languages` is included in the metadata regardless of
strategy
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
2023-11-13 10:47:05 -06:00
|
|
|
EXPECTED_ROWS = 5
|
|
|
|
EXPECTED_COLUMNS = 19
|
|
|
|
print(f"Number of rows in table vs expected: {len(df)}/{EXPECTED_ROWS}")
|
|
|
|
print(f"Number of columns in table vs expected: {len(df.columns)}/{EXPECTED_COLUMNS}")
|
2023-11-03 12:47:21 -04:00
|
|
|
number_of_rows = len(df)
|
Jj/2011 missing languages metadata (#2037)
### Summary
Closes #2011
`languages` was missing from the metadata when partitioning pdfs via
`hi_res` and `fast` strategies and missing from image partitions via
`hi_res`. This PR adds `languages` to the relevant function calls so it
is included in the resulting elements.
### Testing
On the main branch, `partition_image` will include `languages` when
`strategy='ocr_only'`, but not when `strategy='hi_res'`:
```
filename = "example-docs/english-and-korean.png"
from unstructured.partition.image import partition_image
elements = partition_image(filename, strategy="ocr_only", languages=['eng', 'kor'])
elements[0].metadata.languages
elements = partition_image(filename, strategy="hi_res", languages=['eng', 'kor'])
elements[0].metadata.languages
```
For `partition_pdf`, `'ocr_only'` will include `languages` in the
metadata, but `'fast'` and `'hi_res'` will not.
```
filename = "example-docs/korean-text-with-tables.pdf"
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(filename, strategy="ocr_only", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="fast", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="hi_res", languages=['kor'])
elements[0].metadata.languages
```
On this branch, `languages` is included in the metadata regardless of
strategy
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
2023-11-13 10:47:05 -06:00
|
|
|
assert number_of_rows == EXPECTED_ROWS, (
|
2023-11-03 12:47:21 -04:00
|
|
|
f"number of rows in generated table ({number_of_rows}) "
|
Jj/2011 missing languages metadata (#2037)
### Summary
Closes #2011
`languages` was missing from the metadata when partitioning pdfs via
`hi_res` and `fast` strategies and missing from image partitions via
`hi_res`. This PR adds `languages` to the relevant function calls so it
is included in the resulting elements.
### Testing
On the main branch, `partition_image` will include `languages` when
`strategy='ocr_only'`, but not when `strategy='hi_res'`:
```
filename = "example-docs/english-and-korean.png"
from unstructured.partition.image import partition_image
elements = partition_image(filename, strategy="ocr_only", languages=['eng', 'kor'])
elements[0].metadata.languages
elements = partition_image(filename, strategy="hi_res", languages=['eng', 'kor'])
elements[0].metadata.languages
```
For `partition_pdf`, `'ocr_only'` will include `languages` in the
metadata, but `'fast'` and `'hi_res'` will not.
```
filename = "example-docs/korean-text-with-tables.pdf"
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(filename, strategy="ocr_only", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="fast", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="hi_res", languages=['kor'])
elements[0].metadata.languages
```
On this branch, `languages` is included in the metadata regardless of
strategy
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
2023-11-13 10:47:05 -06:00
|
|
|
f"doesn't match expected value: {EXPECTED_ROWS}"
|
2023-11-03 12:47:21 -04:00
|
|
|
)
|
|
|
|
|
|
|
|
"""
|
|
|
|
The number of columns is associated with the flattened JSON structure of the partition output.
|
|
|
|
If this changes, it's most likely due to the metadata changing in the output.
|
|
|
|
"""
|
|
|
|
number_of_columns = len(df.columns)
|
Jj/2011 missing languages metadata (#2037)
### Summary
Closes #2011
`languages` was missing from the metadata when partitioning pdfs via
`hi_res` and `fast` strategies and missing from image partitions via
`hi_res`. This PR adds `languages` to the relevant function calls so it
is included in the resulting elements.
### Testing
On the main branch, `partition_image` will include `languages` when
`strategy='ocr_only'`, but not when `strategy='hi_res'`:
```
filename = "example-docs/english-and-korean.png"
from unstructured.partition.image import partition_image
elements = partition_image(filename, strategy="ocr_only", languages=['eng', 'kor'])
elements[0].metadata.languages
elements = partition_image(filename, strategy="hi_res", languages=['eng', 'kor'])
elements[0].metadata.languages
```
For `partition_pdf`, `'ocr_only'` will include `languages` in the
metadata, but `'fast'` and `'hi_res'` will not.
```
filename = "example-docs/korean-text-with-tables.pdf"
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(filename, strategy="ocr_only", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="fast", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="hi_res", languages=['kor'])
elements[0].metadata.languages
```
On this branch, `languages` is included in the metadata regardless of
strategy
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
2023-11-13 10:47:05 -06:00
|
|
|
assert number_of_columns == EXPECTED_COLUMNS, (
|
2023-11-03 12:47:21 -04:00
|
|
|
f"number of columns in generated table ({number_of_columns}) doesn't "
|
Jj/2011 missing languages metadata (#2037)
### Summary
Closes #2011
`languages` was missing from the metadata when partitioning pdfs via
`hi_res` and `fast` strategies and missing from image partitions via
`hi_res`. This PR adds `languages` to the relevant function calls so it
is included in the resulting elements.
### Testing
On the main branch, `partition_image` will include `languages` when
`strategy='ocr_only'`, but not when `strategy='hi_res'`:
```
filename = "example-docs/english-and-korean.png"
from unstructured.partition.image import partition_image
elements = partition_image(filename, strategy="ocr_only", languages=['eng', 'kor'])
elements[0].metadata.languages
elements = partition_image(filename, strategy="hi_res", languages=['eng', 'kor'])
elements[0].metadata.languages
```
For `partition_pdf`, `'ocr_only'` will include `languages` in the
metadata, but `'fast'` and `'hi_res'` will not.
```
filename = "example-docs/korean-text-with-tables.pdf"
from unstructured.partition.pdf import partition_pdf
elements = partition_pdf(filename, strategy="ocr_only", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="fast", languages=['kor'])
elements[0].metadata.languages
elements = partition_pdf(filename, strategy="hi_res", languages=['kor'])
elements[0].metadata.languages
```
On this branch, `languages` is included in the metadata regardless of
strategy
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: Coniferish <Coniferish@users.noreply.github.com>
2023-11-13 10:47:05 -06:00
|
|
|
f"match expected value: {EXPECTED_COLUMNS}"
|
2023-11-03 08:46:56 -04:00
|
|
|
)
|
2023-09-21 14:51:08 -04:00
|
|
|
print("table check complete")
|
2023-09-15 18:13:39 -04:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
run_check()
|