mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-11-04 03:39:31 +00:00 
			
		
		
		
	refactor: isolate PDF converters (#4193)
This commit is contained in:
		
							parent
							
								
									40f772a9b0
								
							
						
					
					
						commit
						1e4ef24ae9
					
				@ -9,6 +9,7 @@ from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
 | 
			
		||||
from haystack.nodes.file_converter.txt import TextConverter
 | 
			
		||||
from haystack.nodes.file_converter.azure import AzureConverter
 | 
			
		||||
from haystack.nodes.file_converter.parsr import ParsrConverter
 | 
			
		||||
from haystack.nodes.file_converter.pdf import PDFToTextConverter
 | 
			
		||||
 | 
			
		||||
MarkdownConverter = safe_import(
 | 
			
		||||
    "haystack.nodes.file_converter.markdown", "MarkdownConverter", "preprocessing"
 | 
			
		||||
@ -16,9 +17,6 @@ MarkdownConverter = safe_import(
 | 
			
		||||
ImageToTextConverter = safe_import(
 | 
			
		||||
    "haystack.nodes.file_converter.image", "ImageToTextConverter", "ocr"
 | 
			
		||||
)  # Has optional dependencies
 | 
			
		||||
PDFToTextConverter = safe_import(
 | 
			
		||||
    "haystack.nodes.file_converter.pdf", "PDFToTextConverter", "ocr"
 | 
			
		||||
)  # Has optional dependencies
 | 
			
		||||
PDFToTextOCRConverter = safe_import(
 | 
			
		||||
    "haystack.nodes.file_converter.pdf", "PDFToTextOCRConverter", "ocr"
 | 
			
		||||
    "haystack.nodes.file_converter.pdf_ocr", "PDFToTextOCRConverter", "ocr"
 | 
			
		||||
)  # Has optional dependencies
 | 
			
		||||
 | 
			
		||||
@ -1,18 +1,9 @@
 | 
			
		||||
import logging
 | 
			
		||||
import subprocess
 | 
			
		||||
import tempfile
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from typing import Any, Dict, List, Optional
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    from pdf2image import convert_from_path
 | 
			
		||||
except (ImportError, ModuleNotFoundError) as ie:
 | 
			
		||||
    from haystack.utils.import_utils import _optional_component_not_installed
 | 
			
		||||
 | 
			
		||||
    _optional_component_not_installed(__name__, "ocr", ie)
 | 
			
		||||
 | 
			
		||||
from haystack.nodes.file_converter.base import BaseConverter
 | 
			
		||||
from haystack.nodes.file_converter.image import ImageToTextConverter
 | 
			
		||||
from haystack.schema import Document
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
@ -52,7 +43,9 @@ class PDFToTextConverter(BaseConverter):
 | 
			
		||||
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
 | 
			
		||||
        )
 | 
			
		||||
        try:
 | 
			
		||||
            subprocess.run(["pdftotext", "-v"], shell=False, check=False)
 | 
			
		||||
            subprocess.run(
 | 
			
		||||
                ["pdftotext", "-v"], shell=False, check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
 | 
			
		||||
            )
 | 
			
		||||
        except FileNotFoundError:
 | 
			
		||||
            raise FileNotFoundError(
 | 
			
		||||
                """pdftotext is not installed. It is part of xpdf or poppler-utils software suite.
 | 
			
		||||
@ -202,94 +195,3 @@ class PDFToTextConverter(BaseConverter):
 | 
			
		||||
        pages = pages[:-1]  # the last page in the split is always empty.
 | 
			
		||||
 | 
			
		||||
        return pages
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PDFToTextOCRConverter(BaseConverter):
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        remove_numeric_tables: bool = False,
 | 
			
		||||
        valid_languages: Optional[List[str]] = None,
 | 
			
		||||
        id_hash_keys: Optional[List[str]] = None,
 | 
			
		||||
    ):
 | 
			
		||||
        """
 | 
			
		||||
        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
 | 
			
		||||
 | 
			
		||||
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
 | 
			
		||||
                                      The tabular structures in documents might be noise for the reader model if it
 | 
			
		||||
                                      does not have table parsing capability for finding answers. However, tables
 | 
			
		||||
                                      may also have long strings that could possible candidate for searching answers.
 | 
			
		||||
                                      The rows containing strings are thus retained in this option.
 | 
			
		||||
        :param valid_languages: validate languages from a list of languages supported by tessarect
 | 
			
		||||
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
 | 
			
		||||
                                This option can be used to add test for encoding errors. If the extracted text is
 | 
			
		||||
                                not one of the valid languages, then it might likely be encoding error resulting
 | 
			
		||||
                                in garbled text. If no value is provided, English will be set as default.
 | 
			
		||||
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
 | 
			
		||||
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 | 
			
		||||
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 | 
			
		||||
            In this case the id will be generated by using the content and the defined metadata.
 | 
			
		||||
        """
 | 
			
		||||
        if valid_languages is None:
 | 
			
		||||
            valid_languages = ["eng"]
 | 
			
		||||
        # init image to text instance
 | 
			
		||||
        self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)
 | 
			
		||||
 | 
			
		||||
        super().__init__(
 | 
			
		||||
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def convert(
 | 
			
		||||
        self,
 | 
			
		||||
        file_path: Path,
 | 
			
		||||
        meta: Optional[Dict[str, Any]] = None,
 | 
			
		||||
        remove_numeric_tables: Optional[bool] = None,
 | 
			
		||||
        valid_languages: Optional[List[str]] = None,
 | 
			
		||||
        encoding: Optional[str] = None,
 | 
			
		||||
        id_hash_keys: Optional[List[str]] = None,
 | 
			
		||||
        start_page: Optional[int] = None,
 | 
			
		||||
        end_page: Optional[int] = None,
 | 
			
		||||
    ) -> List[Document]:
 | 
			
		||||
        """
 | 
			
		||||
        Convert a file to a dictionary containing the text and any associated meta data.
 | 
			
		||||
 | 
			
		||||
        File converters may extract file meta like name or size. In addition to it, user
 | 
			
		||||
        supplied meta data like author, url, external IDs can be supplied as a dictionary.
 | 
			
		||||
 | 
			
		||||
        :param file_path: path of the file to convert
 | 
			
		||||
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
 | 
			
		||||
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
 | 
			
		||||
                                      The tabular structures in documents might be noise for the reader model if it
 | 
			
		||||
                                      does not have table parsing capability for finding answers. However, tables
 | 
			
		||||
                                      may also have long strings that could possible candidate for searching answers.
 | 
			
		||||
                                      The rows containing strings are thus retained in this option.
 | 
			
		||||
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
 | 
			
		||||
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
 | 
			
		||||
                                This option can be used to add test for encoding errors. If the extracted text is
 | 
			
		||||
                                not one of the valid languages, then it might likely be encoding error resulting
 | 
			
		||||
                                in garbled text.
 | 
			
		||||
        :param encoding: Not applicable
 | 
			
		||||
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
 | 
			
		||||
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 | 
			
		||||
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 | 
			
		||||
            In this case the id will be generated by using the content and the defined metadata.
 | 
			
		||||
        :param start_page: The page number where to start the conversion
 | 
			
		||||
        :param end_page: The page number where to end the conversion.
 | 
			
		||||
        """
 | 
			
		||||
        if id_hash_keys is None:
 | 
			
		||||
            id_hash_keys = self.id_hash_keys
 | 
			
		||||
 | 
			
		||||
        start_page = start_page or 1
 | 
			
		||||
 | 
			
		||||
        pages = []
 | 
			
		||||
        try:
 | 
			
		||||
            images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
 | 
			
		||||
            for image in images:
 | 
			
		||||
                temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg")
 | 
			
		||||
                image.save(temp_img.name)
 | 
			
		||||
                pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content)
 | 
			
		||||
        except Exception as exception:
 | 
			
		||||
            logger.error("File %s has an error:\n%s", file_path, exception)
 | 
			
		||||
 | 
			
		||||
        raw_text = "\f" * (start_page - 1) + "\f".join(pages)  # tracking skipped pages for correct page numbering
 | 
			
		||||
        document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)
 | 
			
		||||
        return [document]
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										108
									
								
								haystack/nodes/file_converter/pdf_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										108
									
								
								haystack/nodes/file_converter/pdf_ocr.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,108 @@
 | 
			
		||||
import logging
 | 
			
		||||
import tempfile
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from typing import Any, Dict, List, Optional
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    from pdf2image import convert_from_path
 | 
			
		||||
except (ImportError, ModuleNotFoundError) as ie:
 | 
			
		||||
    from haystack.utils.import_utils import _optional_component_not_installed
 | 
			
		||||
 | 
			
		||||
    _optional_component_not_installed(__name__, "ocr", ie)
 | 
			
		||||
 | 
			
		||||
from haystack.nodes.file_converter.base import BaseConverter
 | 
			
		||||
from haystack.nodes.file_converter.image import ImageToTextConverter
 | 
			
		||||
from haystack.schema import Document
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PDFToTextOCRConverter(BaseConverter):
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        remove_numeric_tables: bool = False,
 | 
			
		||||
        valid_languages: Optional[List[str]] = None,
 | 
			
		||||
        id_hash_keys: Optional[List[str]] = None,
 | 
			
		||||
    ):
 | 
			
		||||
        """
 | 
			
		||||
        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
 | 
			
		||||
 | 
			
		||||
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
 | 
			
		||||
                                      The tabular structures in documents might be noise for the reader model if it
 | 
			
		||||
                                      does not have table parsing capability for finding answers. However, tables
 | 
			
		||||
                                      may also have long strings that could possible candidate for searching answers.
 | 
			
		||||
                                      The rows containing strings are thus retained in this option.
 | 
			
		||||
        :param valid_languages: validate languages from a list of languages supported by tessarect
 | 
			
		||||
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
 | 
			
		||||
                                This option can be used to add test for encoding errors. If the extracted text is
 | 
			
		||||
                                not one of the valid languages, then it might likely be encoding error resulting
 | 
			
		||||
                                in garbled text. If no value is provided, English will be set as default.
 | 
			
		||||
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
 | 
			
		||||
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 | 
			
		||||
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 | 
			
		||||
            In this case the id will be generated by using the content and the defined metadata.
 | 
			
		||||
        """
 | 
			
		||||
        if valid_languages is None:
 | 
			
		||||
            valid_languages = ["eng"]
 | 
			
		||||
        # init image to text instance
 | 
			
		||||
        self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)
 | 
			
		||||
 | 
			
		||||
        super().__init__(
 | 
			
		||||
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def convert(
 | 
			
		||||
        self,
 | 
			
		||||
        file_path: Path,
 | 
			
		||||
        meta: Optional[Dict[str, Any]] = None,
 | 
			
		||||
        remove_numeric_tables: Optional[bool] = None,
 | 
			
		||||
        valid_languages: Optional[List[str]] = None,
 | 
			
		||||
        encoding: Optional[str] = None,
 | 
			
		||||
        id_hash_keys: Optional[List[str]] = None,
 | 
			
		||||
        start_page: Optional[int] = None,
 | 
			
		||||
        end_page: Optional[int] = None,
 | 
			
		||||
    ) -> List[Document]:
 | 
			
		||||
        """
 | 
			
		||||
        Convert a file to a dictionary containing the text and any associated meta data.
 | 
			
		||||
 | 
			
		||||
        File converters may extract file meta like name or size. In addition to it, user
 | 
			
		||||
        supplied meta data like author, url, external IDs can be supplied as a dictionary.
 | 
			
		||||
 | 
			
		||||
        :param file_path: path of the file to convert
 | 
			
		||||
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
 | 
			
		||||
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
 | 
			
		||||
                                      The tabular structures in documents might be noise for the reader model if it
 | 
			
		||||
                                      does not have table parsing capability for finding answers. However, tables
 | 
			
		||||
                                      may also have long strings that could possible candidate for searching answers.
 | 
			
		||||
                                      The rows containing strings are thus retained in this option.
 | 
			
		||||
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
 | 
			
		||||
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
 | 
			
		||||
                                This option can be used to add test for encoding errors. If the extracted text is
 | 
			
		||||
                                not one of the valid languages, then it might likely be encoding error resulting
 | 
			
		||||
                                in garbled text.
 | 
			
		||||
        :param encoding: Not applicable
 | 
			
		||||
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
 | 
			
		||||
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
 | 
			
		||||
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
 | 
			
		||||
            In this case the id will be generated by using the content and the defined metadata.
 | 
			
		||||
        :param start_page: The page number where to start the conversion
 | 
			
		||||
        :param end_page: The page number where to end the conversion.
 | 
			
		||||
        """
 | 
			
		||||
        if id_hash_keys is None:
 | 
			
		||||
            id_hash_keys = self.id_hash_keys
 | 
			
		||||
 | 
			
		||||
        start_page = start_page or 1
 | 
			
		||||
 | 
			
		||||
        pages = []
 | 
			
		||||
        try:
 | 
			
		||||
            images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
 | 
			
		||||
            for image in images:
 | 
			
		||||
                temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg")
 | 
			
		||||
                image.save(temp_img.name)
 | 
			
		||||
                pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content)
 | 
			
		||||
        except Exception as exception:
 | 
			
		||||
            logger.error("File %s has an error:\n%s", file_path, exception)
 | 
			
		||||
 | 
			
		||||
        raw_text = "\f" * (start_page - 1) + "\f".join(pages)  # tracking skipped pages for correct page numbering
 | 
			
		||||
        document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)
 | 
			
		||||
        return [document]
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user