diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py index e82bb4292..00d46d06f 100644 --- a/unstructured/partition/pdf_image/pdfminer_processing.py +++ b/unstructured/partition/pdf_image/pdfminer_processing.py @@ -87,7 +87,7 @@ def process_data_with_pdfminer( x2 * coef, y2 * coef, text=_text, - source="pdftext", + source=Source.PDFTEXT, ) if text_region.bbox is not None and text_region.bbox.area > 0: @@ -104,7 +104,7 @@ def process_data_with_pdfminer( x2 * coef, y2 * coef, text=None, - source="pdftext", + source=Source.PDFTEXT, ) if image_region.bbox is not None and image_region.bbox.area > 0: layout.append(image_region) diff --git a/unstructured/partition/utils/constants.py b/unstructured/partition/utils/constants.py index c1864a9e5..374a5dda7 100644 --- a/unstructured/partition/utils/constants.py +++ b/unstructured/partition/utils/constants.py @@ -3,6 +3,7 @@ from enum import Enum class Source(Enum): + PDFTEXT = "pdftext" PDFMINER = "pdfminer" OCR_TESSERACT = "ocr_tesseract" OCR_PADDLE = "ocr_paddle"