Chore[ingest]: adding parameter --partition-pdf-infer-table-structure (#1056)

* add param

* expected test

* add option (to do doc nit)

* test with api for now

* typo

* test with api key

* use local only

* encoding -> partition-encoding

* changelog and version

* Update ingest test fixtures (#1055)

Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>

* ignore coordinates

* no whitespace lol

* Update ingest test fixtures (#1061)

Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>

---------

Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
This commit is contained in:
Yuming Long 2023-08-08 18:11:06 -04:00 committed by GitHub
parent ac7efa19e7
commit b4fe40e484
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 2336 additions and 9 deletions

View File

@ -1,7 +1,8 @@
## 0.9.1-dev10
## 0.9.1-dev11
### Enhancements
* Adds --partition-pdf-infer-table-structure to unstructured-ingest.
* Enable `partition_html` to skip headers and footers with the `skip_headers_and_footers` flag.
* Update `partition_doc` and `partition_docx` to track emphasized texts in the output
* Adds post processing function `filter_element_types`

View File

@ -11,7 +11,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
local \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified \
--structured-output-dir "$OUTPUT_DIR" \
--encoding cp1252 \
--partition-encoding cp1252 \
--verbose \
--reprocess \
--input-path example-docs/fake-html-cp1252.html

View File

@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Ingest test: runs the unstructured-ingest "local" connector on a single PDF
# with --partition-pdf-infer-table-structure enabled and the hi_res strategy,
# then hands the output folder name to check-diff-expected-output.sh.

# Abort on the first failing command while setting up and running the ingest.
set -e
# Resolve the directory containing this script, then move one level up so the
# relative paths below (./unstructured/..., example-docs/...) resolve.
SCRIPT_DIR=$(dirname "$(realpath "$0")")
cd "$SCRIPT_DIR"/.. || exit 1
# Output goes to <script dir>/structured-output/<folder name>; the same folder
# name is passed to the check script at the end.
OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
# Run the ingest CLI. The fields given to --metadata-exclude (including
# coordinates) are left out of the output -- presumably because they vary
# between runs or machines and would break the comparison; confirm.
PYTHONPATH=. ./unstructured/ingest/main.py \
local \
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified \
--structured-output-dir "$OUTPUT_DIR" \
--partition-pdf-infer-table-structure true \
--partition-strategy hi_res \
--verbose \
--reprocess \
--input-path example-docs/layout-parser-paper.pdf
# errexit is switched off before the comparison step.
# NOTE(review): presumably so the check script's exit status is handled by the
# caller rather than aborting here -- confirm against the check script.
set +e
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME

View File

@ -30,5 +30,6 @@ export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-confluence-large.sh
./test_unstructured_ingest/test-ingest-local-single-file.sh
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh

View File

@ -1 +1 @@
__version__ = "0.9.1-dev10" # pragma: no cover
__version__ = "0.9.1-dev11" # pragma: no cover

View File

@ -104,7 +104,8 @@ def map_to_processor_config(options: dict) -> ProcessorConfigs:
return ProcessorConfigs(
partition_strategy=options["partition_strategy"],
partition_ocr_languages=options["partition_ocr_languages"],
encoding=options["encoding"],
partition_pdf_infer_table_structure=options["partition_pdf_infer_table_structure"],
partition_encoding=options["partition_encoding"],
num_processes=options["num_processes"],
reprocess=options["reprocess"],
max_docs=options["max_docs"],
@ -200,7 +201,13 @@ def add_shared_options(cmd: Command):
"Default: eng",
),
Option(
["--encoding"],
["--partition-pdf-infer-table-structure"],
default=False,
help="If set to True, partition will includ the table's text content in the response."
"Default: False",
),
Option(
["--partition-encoding"],
default=None,
help="Text encoding to use when reading documents. By default the encoding is "
"detected automatically.",

View File

@ -24,7 +24,8 @@ class ProcessorConfigs:
partition_strategy: str
partition_ocr_languages: str
encoding: str
partition_pdf_infer_table_structure: bool
partition_encoding: str
num_processes: int
reprocess: bool
max_docs: int

View File

@ -97,7 +97,8 @@ def process_documents(
process_document,
strategy=processor_config.partition_strategy,
ocr_languages=processor_config.partition_ocr_languages,
encoding=processor_config.encoding,
encoding=processor_config.partition_encoding,
pdf_infer_table_structure=processor_config.partition_pdf_infer_table_structure,
)
Processor(

View File

@ -330,8 +330,8 @@ def decide_table_extraction(
if doc_type == "pdf":
if doc_type in skip_infer_table_types and pdf_infer_table_structure:
logger.warning(
f"Conflict between variables skip_infer_table_types: {skip_infer_table_types}"
f"and pdf_infer_table_structure: {pdf_infer_table_structure},"
f"Conflict between variables skip_infer_table_types: {skip_infer_table_types} "
f"and pdf_infer_table_structure: {pdf_infer_table_structure}, "
"please reset skip_infer_table_types to turn on table extraction for PDFs.",
)
return not (doc_type in skip_infer_table_types) or pdf_infer_table_structure