mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 20:27:37 +00:00
Chore[ingest]: adding parameter --partition-pdf-infer-table-structure (#1056)
* add param * expected test * add option (to do doc nit) * test with api for now * typo * test with api key * use local only * encoding -> partition-encoding * changelog and version * Update ingest test fixtures (#1055) Co-authored-by: yuming-long <yuming-long@users.noreply.github.com> * ignore coordinates * no witespace lol * Update ingest test fixtures (#1061) Co-authored-by: yuming-long <yuming-long@users.noreply.github.com> --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
This commit is contained in:
parent
ac7efa19e7
commit
b4fe40e484
@ -1,7 +1,8 @@
|
|||||||
## 0.9.1-dev10
|
## 0.9.1-dev11
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
|
* Adds `--partition-pdf-infer-table-structure` to unstructured-ingest.
|
||||||
* Enable `partition_html` to skip headers and footers with the `skip_headers_and_footers` flag.
|
* Enable `partition_html` to skip headers and footers with the `skip_headers_and_footers` flag.
|
||||||
* Update `partition_doc` and `partition_docx` to track emphasized texts in the output
|
* Update `partition_doc` and `partition_docx` to track emphasized texts in the output
|
||||||
* Adds post processing function `filter_element_types`
|
* Adds post processing function `filter_element_types`
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -11,7 +11,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
|||||||
local \
|
local \
|
||||||
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified \
|
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified \
|
||||||
--structured-output-dir "$OUTPUT_DIR" \
|
--structured-output-dir "$OUTPUT_DIR" \
|
||||||
--encoding cp1252 \
|
--partition-encoding cp1252 \
|
||||||
--verbose \
|
--verbose \
|
||||||
--reprocess \
|
--reprocess \
|
||||||
--input-path example-docs/fake-html-cp1252.html
|
--input-path example-docs/fake-html-cp1252.html
|
||||||
|
@ -0,0 +1,22 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||||
|
cd "$SCRIPT_DIR"/.. || exit 1
|
||||||
|
OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure
|
||||||
|
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||||
|
|
||||||
|
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||||
|
local \
|
||||||
|
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified \
|
||||||
|
--structured-output-dir "$OUTPUT_DIR" \
|
||||||
|
--partition-pdf-infer-table-structure true \
|
||||||
|
--partition-strategy hi_res \
|
||||||
|
--verbose \
|
||||||
|
--reprocess \
|
||||||
|
--input-path example-docs/layout-parser-paper.pdf
|
||||||
|
|
||||||
|
set +e
|
||||||
|
|
||||||
|
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
@ -30,5 +30,6 @@ export OMP_THREAD_LIMIT=1
|
|||||||
./test_unstructured_ingest/test-ingest-confluence-large.sh
|
./test_unstructured_ingest/test-ingest-confluence-large.sh
|
||||||
./test_unstructured_ingest/test-ingest-local-single-file.sh
|
./test_unstructured_ingest/test-ingest-local-single-file.sh
|
||||||
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh
|
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh
|
||||||
|
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh
|
||||||
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
|
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
|
||||||
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
|
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.9.1-dev10" # pragma: no cover
|
__version__ = "0.9.1-dev11" # pragma: no cover
|
||||||
|
@ -104,7 +104,8 @@ def map_to_processor_config(options: dict) -> ProcessorConfigs:
|
|||||||
return ProcessorConfigs(
|
return ProcessorConfigs(
|
||||||
partition_strategy=options["partition_strategy"],
|
partition_strategy=options["partition_strategy"],
|
||||||
partition_ocr_languages=options["partition_ocr_languages"],
|
partition_ocr_languages=options["partition_ocr_languages"],
|
||||||
encoding=options["encoding"],
|
partition_pdf_infer_table_structure=options["partition_pdf_infer_table_structure"],
|
||||||
|
partition_encoding=options["partition_encoding"],
|
||||||
num_processes=options["num_processes"],
|
num_processes=options["num_processes"],
|
||||||
reprocess=options["reprocess"],
|
reprocess=options["reprocess"],
|
||||||
max_docs=options["max_docs"],
|
max_docs=options["max_docs"],
|
||||||
@ -200,7 +201,13 @@ def add_shared_options(cmd: Command):
|
|||||||
"Default: eng",
|
"Default: eng",
|
||||||
),
|
),
|
||||||
Option(
|
Option(
|
||||||
["--encoding"],
|
["--partition-pdf-infer-table-structure"],
|
||||||
|
default=False,
|
||||||
|
help="If set to True, partition will include the table's text content in the response. "
|
||||||
|
"Default: False",
|
||||||
|
),
|
||||||
|
Option(
|
||||||
|
["--partition-encoding"],
|
||||||
default=None,
|
default=None,
|
||||||
help="Text encoding to use when reading documents. By default the encoding is "
|
help="Text encoding to use when reading documents. By default the encoding is "
|
||||||
"detected automatically.",
|
"detected automatically.",
|
||||||
|
@ -24,7 +24,8 @@ class ProcessorConfigs:
|
|||||||
|
|
||||||
partition_strategy: str
|
partition_strategy: str
|
||||||
partition_ocr_languages: str
|
partition_ocr_languages: str
|
||||||
encoding: str
|
partition_pdf_infer_table_structure: bool
|
||||||
|
partition_encoding: str
|
||||||
num_processes: int
|
num_processes: int
|
||||||
reprocess: bool
|
reprocess: bool
|
||||||
max_docs: int
|
max_docs: int
|
||||||
|
@ -97,7 +97,8 @@ def process_documents(
|
|||||||
process_document,
|
process_document,
|
||||||
strategy=processor_config.partition_strategy,
|
strategy=processor_config.partition_strategy,
|
||||||
ocr_languages=processor_config.partition_ocr_languages,
|
ocr_languages=processor_config.partition_ocr_languages,
|
||||||
encoding=processor_config.encoding,
|
encoding=processor_config.partition_encoding,
|
||||||
|
pdf_infer_table_structure=processor_config.partition_pdf_infer_table_structure,
|
||||||
)
|
)
|
||||||
|
|
||||||
Processor(
|
Processor(
|
||||||
|
@ -330,8 +330,8 @@ def decide_table_extraction(
|
|||||||
if doc_type == "pdf":
|
if doc_type == "pdf":
|
||||||
if doc_type in skip_infer_table_types and pdf_infer_table_structure:
|
if doc_type in skip_infer_table_types and pdf_infer_table_structure:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Conflict between variables skip_infer_table_types: {skip_infer_table_types}"
|
f"Conflict between variables skip_infer_table_types: {skip_infer_table_types} "
|
||||||
f"and pdf_infer_table_structure: {pdf_infer_table_structure},"
|
f"and pdf_infer_table_structure: {pdf_infer_table_structure}, "
|
||||||
"please reset skip_infer_table_types to turn on table extraction for PDFs.",
|
"please reset skip_infer_table_types to turn on table extraction for PDFs.",
|
||||||
)
|
)
|
||||||
return not (doc_type in skip_infer_table_types) or pdf_infer_table_structure
|
return not (doc_type in skip_infer_table_types) or pdf_infer_table_structure
|
||||||
|
Loading…
x
Reference in New Issue
Block a user