diff --git a/haystack/nodes/__init__.py b/haystack/nodes/__init__.py
index a969e245b..9ac1f3268 100644
--- a/haystack/nodes/__init__.py
+++ b/haystack/nodes/__init__.py
@@ -7,6 +7,7 @@ from haystack.nodes.file_classifier import FileTypeClassifier
 from haystack.nodes.file_converter import (
     BaseConverter,
     DocxToTextConverter,
+    PptxConverter,
     ImageToTextConverter,
     MarkdownConverter,
     PDFToTextConverter,
diff --git a/haystack/nodes/file_converter/__init__.py b/haystack/nodes/file_converter/__init__.py
index 0c24dbcdf..76a5dd1aa 100644
--- a/haystack/nodes/file_converter/__init__.py
+++ b/haystack/nodes/file_converter/__init__.py
@@ -3,6 +3,7 @@ from haystack.nodes.file_converter.base import BaseConverter
 
 from haystack.nodes.file_converter.csv import CsvTextConverter
 from haystack.nodes.file_converter.docx import DocxToTextConverter
+from haystack.nodes.file_converter.pptx import PptxConverter
 from haystack.nodes.file_converter.json import JsonConverter
 from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
 from haystack.nodes.file_converter.txt import TextConverter
diff --git a/haystack/nodes/file_converter/pptx.py b/haystack/nodes/file_converter/pptx.py
new file mode 100644
index 000000000..84fe46b1d
--- /dev/null
+++ b/haystack/nodes/file_converter/pptx.py
@@ -0,0 +1,99 @@
+from typing import List, Optional, Dict
+from pathlib import Path
+import logging
+
+from haystack.schema import Document
+from haystack.lazy_imports import LazyImport
+from haystack.nodes.file_converter.base import BaseConverter
+
+
+logger = logging.getLogger(__name__)
+
+
+with LazyImport("Run 'pip install python-pptx'") as pptx_import:
+    from pptx import Presentation
+
+
+class PptxConverter(BaseConverter):
+    """
+    Convert a .pptx presentation into a single Haystack Document.
+
+    The text of every shape that exposes a `text` attribute is collected slide by slide and joined with newlines.
+    """
+
+    def __init__(
+        self,
+        remove_numeric_tables: bool = False,
+        valid_languages: Optional[List[str]] = None,
+        id_hash_keys: Optional[List[str]] = None,
+        progress_bar: bool = True,
+    ):
+        """
+        :param remove_numeric_tables: Forwarded to `BaseConverter`; `convert()` falls back to this value. Not
+            supported by this converter: `convert()` raises if the effective value is True.
+        :param valid_languages: Forwarded to `BaseConverter`; `convert()` falls back to this value. Not
+            supported by this converter: `convert()` raises if the effective value is a non-empty list.
+        :param id_hash_keys: Forwarded to `BaseConverter`; `convert()` falls back to this value when no
+            `id_hash_keys` are passed to it.
+        :param progress_bar: Forwarded unchanged to `BaseConverter`.
+        """
+        pptx_import.check()
+        super().__init__(
+            remove_numeric_tables=remove_numeric_tables,
+            valid_languages=valid_languages,
+            id_hash_keys=id_hash_keys,
+            progress_bar=progress_bar,
+        )
+
+    def convert(
+        self,
+        file_path: Path,
+        meta: Optional[Dict[str, str]] = None,
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = None,
+        id_hash_keys: Optional[List[str]] = None,
+    ) -> List[Document]:
+        """
+        Extract text from a .pptx file.
+        Note: The text of all slides is joined with newlines and returned as a single Document, so slide
+        boundaries are not preserved in the output.
+
+        :param file_path: Path to the .pptx file you want to convert
+        :param meta: dictionary of meta data key-value pairs to append in the returned document.
+        :param remove_numeric_tables: Not supported by this converter. An exception is raised if this (or the
+            value set in the constructor) is True.
+        :param valid_languages: Not supported by this converter. An exception is raised if this (or the
+            value set in the constructor) is a non-empty list.
+        :param encoding: Not applicable
+        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
+            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+            In this case the id will be generated by using the content and the defined metadata.
+        :return: A list containing exactly one Document holding the text of the whole presentation.
+        :raises Exception: If `remove_numeric_tables` is True or `valid_languages` is a non-empty list.
+        """
+        if remove_numeric_tables is None:
+            remove_numeric_tables = self.remove_numeric_tables
+        if valid_languages is None:
+            valid_languages = self.valid_languages
+        if id_hash_keys is None:
+            id_hash_keys = self.id_hash_keys
+
+        if remove_numeric_tables is True:
+            raise Exception("'remove_numeric_tables' is not supported by PptxConverter.")
+        # `valid_languages` is a list, so it can never be identical to `True`; test for a non-empty list instead.
+        if valid_languages:
+            raise Exception("Language validation using 'valid_languages' is not supported by PptxConverter.")
+
+        pres = Presentation(file_path)
+        text_parts = []
+        for slide in pres.slides:
+            for shape in slide.shapes:
+                # Not every shape carries text (e.g. pictures), hence the attribute check.
+                if hasattr(shape, "text"):
+                    text_parts.append(shape.text)
+        text = "\n".join(text_parts)
+
+        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
+        return [document]
diff --git a/pyproject.toml b/pyproject.toml
index 06982853b..8def0a45c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -165,6 +165,7 @@ preprocessing = [
 file-conversion = [
   "azure-ai-formrecognizer>=3.2.0b2",  # Microsoft Azure's Form Recognizer service (text and table exctrator)
   "python-docx",
+  "python-pptx",
   "tika",  # Apache Tika (text & metadata extractor)
   "beautifulsoup4",
   "markdown",
diff --git a/releasenotes/notes/pptx-file-converter-3e494d2747637eb2.yaml b/releasenotes/notes/pptx-file-converter-3e494d2747637eb2.yaml
new file mode 100644
index 000000000..6090da6d4
--- /dev/null
+++ b/releasenotes/notes/pptx-file-converter-3e494d2747637eb2.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Add PptxConverter: a node to convert pptx files to Haystack Documents.
diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py
index c61ed90c4..9daee4b58 100644
--- a/test/nodes/test_file_converter.py
+++ b/test/nodes/test_file_converter.py
@@ -16,6 +16,7 @@ from haystack.nodes import (
     AzureConverter,
     CsvTextConverter,
     DocxToTextConverter,
+    PptxConverter,
     JsonConverter,
     MarkdownConverter,
     ParsrConverter,
@@ -207,6 +208,14 @@ def test_docx_converter(samples_path):
     assert document.content.startswith("Sample Docx File")
 
 
+@pytest.mark.unit
+def test_pptx_converter(samples_path):
+    """The sample presentation is converted and its text starts with the expected title."""
+    pptx_path = samples_path / "pptx" / "sample_pptx.pptx"
+    documents = PptxConverter().convert(file_path=pptx_path)
+    assert documents[0].content.startswith("Sample Pptx File")
+
+
 @pytest.mark.unit
 def test_markdown_converter(samples_path):
     converter = MarkdownConverter()
diff --git a/test/samples/pptx/sample_pptx.pptx b/test/samples/pptx/sample_pptx.pptx
new file mode 100644
index 000000000..f34bde75e
Binary files /dev/null and b/test/samples/pptx/sample_pptx.pptx differ