mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 04:08:49 +00:00
Chore[ingest]: adding parameter --partition-pdf-infer-table-structure (#1056)
* add param * expected test * add option (to do doc nit) * test with api for now * typo * test with api key * use local only * encoding -> partition-encoding * changelog and version * Update ingest test fixtures (#1055) Co-authored-by: yuming-long <yuming-long@users.noreply.github.com> * ignore coordinates * no witespace lol * Update ingest test fixtures (#1061) Co-authored-by: yuming-long <yuming-long@users.noreply.github.com> --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
This commit is contained in:
parent
ac7efa19e7
commit
b4fe40e484
@ -1,7 +1,8 @@
|
||||
## 0.9.1-dev10
|
||||
## 0.9.1-dev11
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Adds --partition-pdf-infer-table-structure to unstructured-ingest.
|
||||
* Enable `partition_html` to skip headers and footers with the `skip_headers_and_footers` flag.
|
||||
* Update `partition_doc` and `partition_docx` to track emphasized texts in the output
|
||||
* Adds post processing function `filter_element_types`
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -11,7 +11,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--metadata-exclude filename,file_directory,metadata.data_source.date_processed,metadata.last_modified \
|
||||
--structured-output-dir "$OUTPUT_DIR" \
|
||||
--encoding cp1252 \
|
||||
--partition-encoding cp1252 \
|
||||
--verbose \
|
||||
--reprocess \
|
||||
--input-path example-docs/fake-html-cp1252.html
|
||||
|
@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -e
|
||||
|
||||
SCRIPT_DIR=$(dirname "$(realpath "$0")")
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
OUTPUT_FOLDER_NAME=local-single-file-with-pdf-infer-table-structure
|
||||
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
|
||||
|
||||
PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
local \
|
||||
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified \
|
||||
--structured-output-dir "$OUTPUT_DIR" \
|
||||
--partition-pdf-infer-table-structure true \
|
||||
--partition-strategy hi_res \
|
||||
--verbose \
|
||||
--reprocess \
|
||||
--input-path example-docs/layout-parser-paper.pdf
|
||||
|
||||
set +e
|
||||
|
||||
sh "$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME
|
@ -30,5 +30,6 @@ export OMP_THREAD_LIMIT=1
|
||||
./test_unstructured_ingest/test-ingest-confluence-large.sh
|
||||
./test_unstructured_ingest/test-ingest-local-single-file.sh
|
||||
./test_unstructured_ingest/test-ingest-local-single-file-with-encoding.sh
|
||||
./test_unstructured_ingest/test-ingest-local-single-file-with-pdf-infer-table-structure.sh
|
||||
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
|
||||
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.9.1-dev10" # pragma: no cover
|
||||
__version__ = "0.9.1-dev11" # pragma: no cover
|
||||
|
@ -104,7 +104,8 @@ def map_to_processor_config(options: dict) -> ProcessorConfigs:
|
||||
return ProcessorConfigs(
|
||||
partition_strategy=options["partition_strategy"],
|
||||
partition_ocr_languages=options["partition_ocr_languages"],
|
||||
encoding=options["encoding"],
|
||||
partition_pdf_infer_table_structure=options["partition_pdf_infer_table_structure"],
|
||||
partition_encoding=options["partition_encoding"],
|
||||
num_processes=options["num_processes"],
|
||||
reprocess=options["reprocess"],
|
||||
max_docs=options["max_docs"],
|
||||
@ -200,7 +201,13 @@ def add_shared_options(cmd: Command):
|
||||
"Default: eng",
|
||||
),
|
||||
Option(
|
||||
["--encoding"],
|
||||
["--partition-pdf-infer-table-structure"],
|
||||
default=False,
|
||||
help="If set to True, partition will include the table's text content in the response. "
|
||||
"Default: False",
|
||||
),
|
||||
Option(
|
||||
["--partition-encoding"],
|
||||
default=None,
|
||||
help="Text encoding to use when reading documents. By default the encoding is "
|
||||
"detected automatically.",
|
||||
|
@ -24,7 +24,8 @@ class ProcessorConfigs:
|
||||
|
||||
partition_strategy: str
|
||||
partition_ocr_languages: str
|
||||
encoding: str
|
||||
partition_pdf_infer_table_structure: bool
|
||||
partition_encoding: str
|
||||
num_processes: int
|
||||
reprocess: bool
|
||||
max_docs: int
|
||||
|
@ -97,7 +97,8 @@ def process_documents(
|
||||
process_document,
|
||||
strategy=processor_config.partition_strategy,
|
||||
ocr_languages=processor_config.partition_ocr_languages,
|
||||
encoding=processor_config.encoding,
|
||||
encoding=processor_config.partition_encoding,
|
||||
pdf_infer_table_structure=processor_config.partition_pdf_infer_table_structure,
|
||||
)
|
||||
|
||||
Processor(
|
||||
|
@ -330,8 +330,8 @@ def decide_table_extraction(
|
||||
if doc_type == "pdf":
|
||||
if doc_type in skip_infer_table_types and pdf_infer_table_structure:
|
||||
logger.warning(
|
||||
f"Conflict between variables skip_infer_table_types: {skip_infer_table_types}"
|
||||
f"and pdf_infer_table_structure: {pdf_infer_table_structure},"
|
||||
f"Conflict between variables skip_infer_table_types: {skip_infer_table_types} "
|
||||
f"and pdf_infer_table_structure: {pdf_infer_table_structure}, "
|
||||
"please reset skip_infer_table_types to turn on table extraction for PDFs.",
|
||||
)
|
||||
return not (doc_type in skip_infer_table_types) or pdf_infer_table_structure
|
||||
|
Loading…
x
Reference in New Issue
Block a user