mirror of
				https://github.com/deepset-ai/haystack.git
				synced 2025-10-31 17:59:27 +00:00 
			
		
		
		
	refactor: isolate PDF converters (#4193)
This commit is contained in:
		
							parent
							
								
									40f772a9b0
								
							
						
					
					
						commit
						1e4ef24ae9
					
				| @ -9,6 +9,7 @@ from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser | ||||
| from haystack.nodes.file_converter.txt import TextConverter | ||||
| from haystack.nodes.file_converter.azure import AzureConverter | ||||
| from haystack.nodes.file_converter.parsr import ParsrConverter | ||||
| from haystack.nodes.file_converter.pdf import PDFToTextConverter | ||||
| 
 | ||||
| MarkdownConverter = safe_import( | ||||
|     "haystack.nodes.file_converter.markdown", "MarkdownConverter", "preprocessing" | ||||
| @ -16,9 +17,6 @@ MarkdownConverter = safe_import( | ||||
| ImageToTextConverter = safe_import( | ||||
|     "haystack.nodes.file_converter.image", "ImageToTextConverter", "ocr" | ||||
| )  # Has optional dependencies | ||||
| PDFToTextConverter = safe_import( | ||||
|     "haystack.nodes.file_converter.pdf", "PDFToTextConverter", "ocr" | ||||
| )  # Has optional dependencies | ||||
# Lazily import the OCR-based PDF converter from its new, isolated module
# (moved from file_converter.pdf to file_converter.pdf_ocr in this refactor).
# NOTE: the scraped diff fused the removed old-path line and the added new-path
# line into one call; only the post-refactor module path is correct here.
PDFToTextOCRConverter = safe_import(
    "haystack.nodes.file_converter.pdf_ocr", "PDFToTextOCRConverter", "ocr"
)  # Has optional dependencies
|  | ||||
| @ -1,18 +1,9 @@ | ||||
| import logging | ||||
| import subprocess | ||||
| import tempfile | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, List, Optional | ||||
| 
 | ||||
| try: | ||||
|     from pdf2image import convert_from_path | ||||
| except (ImportError, ModuleNotFoundError) as ie: | ||||
|     from haystack.utils.import_utils import _optional_component_not_installed | ||||
| 
 | ||||
|     _optional_component_not_installed(__name__, "ocr", ie) | ||||
| 
 | ||||
| from haystack.nodes.file_converter.base import BaseConverter | ||||
| from haystack.nodes.file_converter.image import ImageToTextConverter | ||||
| from haystack.schema import Document | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| @ -52,7 +43,9 @@ class PDFToTextConverter(BaseConverter): | ||||
|             remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys | ||||
|         ) | ||||
|         try: | ||||
|             subprocess.run(["pdftotext", "-v"], shell=False, check=False) | ||||
|             subprocess.run( | ||||
|                 ["pdftotext", "-v"], shell=False, check=False, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL | ||||
|             ) | ||||
|         except FileNotFoundError: | ||||
|             raise FileNotFoundError( | ||||
|                 """pdftotext is not installed. It is part of xpdf or poppler-utils software suite. | ||||
| @ -202,94 +195,3 @@ class PDFToTextConverter(BaseConverter): | ||||
|         pages = pages[:-1]  # the last page in the split is always empty. | ||||
| 
 | ||||
|         return pages | ||||
| 
 | ||||
| 
 | ||||
| class PDFToTextOCRConverter(BaseConverter): | ||||
|     def __init__( | ||||
|         self, | ||||
|         remove_numeric_tables: bool = False, | ||||
|         valid_languages: Optional[List[str]] = None, | ||||
|         id_hash_keys: Optional[List[str]] = None, | ||||
|     ): | ||||
|         """ | ||||
|         Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract) | ||||
| 
 | ||||
|         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. | ||||
|                                       The tabular structures in documents might be noise for the reader model if it | ||||
|                                       does not have table parsing capability for finding answers. However, tables | ||||
|                                       may also have long strings that could possible candidate for searching answers. | ||||
|                                       The rows containing strings are thus retained in this option. | ||||
|         :param valid_languages: validate languages from a list of languages supported by tesseract | ||||
|                                 (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). | ||||
|                                 This option can be used to add test for encoding errors. If the extracted text is | ||||
|                                 not one of the valid languages, then it might likely be encoding error resulting | ||||
|                                 in garbled text. If no value is provided, English will be set as default. | ||||
|         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's | ||||
|             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are | ||||
|             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). | ||||
|             In this case the id will be generated by using the content and the defined metadata. | ||||
|         """ | ||||
|         if valid_languages is None: | ||||
|             valid_languages = ["eng"] | ||||
|         # init image to text instance | ||||
|         self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages) | ||||
| 
 | ||||
|         super().__init__( | ||||
|             remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys | ||||
|         ) | ||||
| 
 | ||||
|     def convert( | ||||
|         self, | ||||
|         file_path: Path, | ||||
|         meta: Optional[Dict[str, Any]] = None, | ||||
|         remove_numeric_tables: Optional[bool] = None, | ||||
|         valid_languages: Optional[List[str]] = None, | ||||
|         encoding: Optional[str] = None, | ||||
|         id_hash_keys: Optional[List[str]] = None, | ||||
|         start_page: Optional[int] = None, | ||||
|         end_page: Optional[int] = None, | ||||
|     ) -> List[Document]: | ||||
|         """ | ||||
|         Convert a file to a dictionary containing the text and any associated meta data. | ||||
| 
 | ||||
|         File converters may extract file meta like name or size. In addition to it, user | ||||
|         supplied meta data like author, url, external IDs can be supplied as a dictionary. | ||||
| 
 | ||||
|         :param file_path: path of the file to convert | ||||
|         :param meta: dictionary of meta data key-value pairs to append in the returned document. | ||||
|         :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables. | ||||
|                                       The tabular structures in documents might be noise for the reader model if it | ||||
|                                       does not have table parsing capability for finding answers. However, tables | ||||
|                                       may also have long strings that could possible candidate for searching answers. | ||||
|                                       The rows containing strings are thus retained in this option. | ||||
|         :param valid_languages: validate languages from a list of languages specified in the ISO 639-1 | ||||
|                                 (https://en.wikipedia.org/wiki/ISO_639-1) format. | ||||
|                                 This option can be used to add test for encoding errors. If the extracted text is | ||||
|                                 not one of the valid languages, then it might likely be encoding error resulting | ||||
|                                 in garbled text. | ||||
|         :param encoding: Not applicable | ||||
|         :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's | ||||
|             attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are | ||||
|             not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). | ||||
|             In this case the id will be generated by using the content and the defined metadata. | ||||
|         :param start_page: The page number where to start the conversion | ||||
|         :param end_page: The page number where to end the conversion. | ||||
|         """ | ||||
|         if id_hash_keys is None: | ||||
|             id_hash_keys = self.id_hash_keys | ||||
| 
 | ||||
|         start_page = start_page or 1 | ||||
| 
 | ||||
|         pages = [] | ||||
|         try: | ||||
|             images = convert_from_path(file_path, first_page=start_page, last_page=end_page) | ||||
|             for image in images: | ||||
|                 temp_img = tempfile.NamedTemporaryFile(suffix=".jpeg") | ||||
|                 image.save(temp_img.name) | ||||
|                 pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content) | ||||
|         except Exception as exception: | ||||
|             logger.error("File %s has an error:\n%s", file_path, exception) | ||||
| 
 | ||||
|         raw_text = "\f" * (start_page - 1) + "\f".join(pages)  # tracking skipped pages for correct page numbering | ||||
|         document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys) | ||||
|         return [document] | ||||
|  | ||||
							
								
								
									
										108
									
								
								haystack/nodes/file_converter/pdf_ocr.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										108
									
								
								haystack/nodes/file_converter/pdf_ocr.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,108 @@ | ||||
| import logging | ||||
| import tempfile | ||||
| from pathlib import Path | ||||
| from typing import Any, Dict, List, Optional | ||||
| 
 | ||||
| try: | ||||
|     from pdf2image import convert_from_path | ||||
| except (ImportError, ModuleNotFoundError) as ie: | ||||
|     from haystack.utils.import_utils import _optional_component_not_installed | ||||
| 
 | ||||
|     _optional_component_not_installed(__name__, "ocr", ie) | ||||
| 
 | ||||
| from haystack.nodes.file_converter.base import BaseConverter | ||||
| from haystack.nodes.file_converter.image import ImageToTextConverter | ||||
| from haystack.schema import Document | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
| 
 | ||||
| 
 | ||||
class PDFToTextOCRConverter(BaseConverter):
    """
    Convert PDF files to text by rendering each page to an image with pdf2image
    and running OCR (pytesseract, via ImageToTextConverter) on every page image.
    """

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
    ):
        """
        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages supported by tesseract
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text. If no value is provided, English will be set as default.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        """
        if valid_languages is None:
            valid_languages = ["eng"]
        # OCR engine applied to every page image rendered from the PDF
        self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)

        super().__init__(
            remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages, id_hash_keys=id_hash_keys
        )

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, Any]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
        start_page: Optional[int] = None,
        end_page: Optional[int] = None,
    ) -> List[Document]:
        """
        Convert a file to a dictionary containing the text and any associated meta data.

        File converters may extract file meta like name or size. In addition to it, user
        supplied meta data like author, url, external IDs can be supplied as a dictionary.

        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching answers.
                                      The rows containing strings are thus retained in this option.
                                      NOTE(review): currently ignored here — the value given to `__init__` is used.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
                                NOTE(review): currently ignored here — the value given to `__init__` is used.
        :param encoding: Not applicable
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param start_page: The page number where to start the conversion
        :param end_page: The page number where to end the conversion.
        """
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        # pdf2image page numbering is 1-based; default to the first page
        start_page = start_page or 1

        pages = []
        try:
            images = convert_from_path(file_path, first_page=start_page, last_page=end_page)
            for image in images:
                # Use a context manager so each temporary page image is closed and
                # deleted deterministically, instead of leaking open file handles
                # until the garbage collector finalizes the NamedTemporaryFile.
                with tempfile.NamedTemporaryFile(suffix=".jpeg") as temp_img:
                    image.save(temp_img.name)
                    pages.append(self.image_2_text.convert(file_path=temp_img.name)[0].content)
        except Exception as exception:
            # Best-effort conversion: log and fall through with whatever pages succeeded
            logger.error("File %s has an error:\n%s", file_path, exception)

        raw_text = "\f" * (start_page - 1) + "\f".join(pages)  # tracking skipped pages for correct page numbering
        document = Document(content=raw_text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Daniel Bichuetti
						Daniel Bichuetti