From d2f62abe56a4dfbb07bbef42836c5eb0c7713c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= Date: Wed, 8 May 2024 17:43:24 +0200 Subject: [PATCH] Use Source.PDFTEXT enum member instead of "pdftext" string literal --- unstructured/partition/pdf_image/pdfminer_processing.py | 4 ++-- unstructured/partition/utils/constants.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index e82bb4292..00d46d06f 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -87,7 +87,7 @@ def process_data_with_pdfminer( x2 * coef, y2 * coef, text=_text, - source="pdftext", + source=Source.PDFTEXT, ) if text_region.bbox is not None and text_region.bbox.area > 0: @@ -104,7 +104,7 @@ def process_data_with_pdfminer( x2 * coef, y2 * coef, text=None, - source="pdftext", + source=Source.PDFTEXT, ) if image_region.bbox is not None and image_region.bbox.area > 0: layout.append(image_region) diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index c1864a9e5..374a5dda7 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -3,6 +3,7 @@ from enum import Enum class Source(Enum): + PDFTEXT = "pdftext" PDFMINER = "pdfminer" OCR_TESSERACT = "ocr_tesseract" OCR_PADDLE = "ocr_paddle"