mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-27 18:06:17 +00:00
feat: add microsoft pptx file converter (#6399)
* Create pptx.py * feat: pptx converter import __init__.py * feat: add pptx import __init__.py * feat: add python-pptx dependency * feat: add sample pptx for testing * feat: add pptx file-converter test * feat: release note pptx-file-converter-3e494d2747637eb2.yaml * feat: Update releasenotes/notes/pptx-file-converter-3e494d2747637eb2.yaml Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> * feat: refactor haystack/nodes/file_converter/pptx.py Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> * fix imports --------- Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
This commit is contained in:
parent
604b177788
commit
c44e2cf49b
@ -7,6 +7,7 @@ from haystack.nodes.file_classifier import FileTypeClassifier
|
||||
from haystack.nodes.file_converter import (
|
||||
BaseConverter,
|
||||
DocxToTextConverter,
|
||||
PptxConverter,
|
||||
ImageToTextConverter,
|
||||
MarkdownConverter,
|
||||
PDFToTextConverter,
|
||||
|
@ -3,6 +3,7 @@ from haystack.nodes.file_converter.base import BaseConverter
|
||||
|
||||
from haystack.nodes.file_converter.csv import CsvTextConverter
|
||||
from haystack.nodes.file_converter.docx import DocxToTextConverter
|
||||
from haystack.nodes.file_converter.pptx import PptxConverter
|
||||
from haystack.nodes.file_converter.json import JsonConverter
|
||||
from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
|
||||
from haystack.nodes.file_converter.txt import TextConverter
|
||||
|
85
haystack/nodes/file_converter/pptx.py
Normal file
85
haystack/nodes/file_converter/pptx.py
Normal file
@ -0,0 +1,85 @@
|
||||
from typing import List, Optional, Dict
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
from haystack.schema import Document
|
||||
from haystack.lazy_imports import LazyImport
|
||||
from haystack.nodes.file_converter.base import BaseConverter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
with LazyImport("Run 'pip install python-pptx'") as pptx_import:
|
||||
from pptx import Presentation
|
||||
|
||||
|
||||
class PptxConverter(BaseConverter):
    """
    Convert Microsoft PowerPoint (.pptx) files into Haystack Documents.

    The converter walks over every slide of the presentation, collects the text of each shape that exposes a
    `text` attribute and returns everything as one Document. It needs the `python-pptx` package.
    """

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        progress_bar: bool = True,
    ):
        """
        :param remove_numeric_tables: Not supported by this converter. `convert()` raises if the effective
                                      value is True.
        :param valid_languages: Not supported by this converter. `convert()` ignores a non-empty value and
                                logs a warning.
        :param id_hash_keys: Default list of Document attributes used to generate the document id. Used by
                             `convert()` when no per-call value is given.
        :param progress_bar: Forwarded unchanged to `BaseConverter`.
        """
        # Presumably surfaces the "pip install python-pptx" hint right away when the optional
        # dependency is missing, instead of failing later inside convert().
        pptx_import.check()
        super().__init__(
            remove_numeric_tables=remove_numeric_tables,
            valid_languages=valid_languages,
            id_hash_keys=id_hash_keys,
            progress_bar=progress_bar,
        )

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Extract text from a .pptx file.

        Note: the text of all slides is joined with newlines and returned as a single Document; slide
        boundaries are not preserved. The signature mirrors the other converters, which is why parameters
        that do not apply to presentations are still accepted.

        :param file_path: Path to the .pptx file you want to convert.
        :param meta: Dictionary of metadata key-value pairs to attach to the returned document.
        :param remove_numeric_tables: Not supported. If the effective value (this argument, or the instance
                                      default when None) is True, an exception is raised.
        :param valid_languages: Not supported. Languages are expected in ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format by other converters; here a
                                non-empty effective value is ignored and a warning is logged.
        :param encoding: Not applicable; accepted only for interface compatibility and never read.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the
                             document's attributes. If you want to ensure you don't have duplicate documents
                             in your DocumentStore but texts are not unique, you can modify the metadata and
                             pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). In this case the
                             id will be generated by using the content and the defined metadata.
        :return: A list holding exactly one Document with the extracted text.
        :raises Exception: If `remove_numeric_tables` is True.
        """
        # Per-call arguments win; fall back to the instance-level defaults otherwise.
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        if remove_numeric_tables is True:
            raise Exception("'remove_numeric_tables' is not supported by PptxConverter.")
        if valid_languages:
            # The previous `valid_languages is True` test could never match a list of language codes,
            # so the option was dropped silently. Warn instead of raising to keep existing callers working.
            logger.warning(
                "Language validation using 'valid_languages' is not supported by PptxConverter. "
                "The value %s is ignored.",
                valid_languages,
            )

        presentation = Presentation(file_path)
        text_parts = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Only shapes that expose a `text` attribute contribute to the output.
                if hasattr(shape, "text"):
                    text_parts.append(shape.text)
        text = "\n".join(text_parts)

        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]
|
@ -165,6 +165,7 @@ preprocessing = [
|
||||
file-conversion = [
|
||||
"azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table exctrator)
|
||||
"python-docx",
|
||||
"python-pptx",
|
||||
"tika", # Apache Tika (text & metadata extractor)
|
||||
"beautifulsoup4",
|
||||
"markdown",
|
||||
|
@ -0,0 +1,4 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Add PptxConverter: a node to convert pptx files to Haystack Documents.
|
@ -16,6 +16,7 @@ from haystack.nodes import (
|
||||
AzureConverter,
|
||||
CsvTextConverter,
|
||||
DocxToTextConverter,
|
||||
PptxConverter,
|
||||
JsonConverter,
|
||||
MarkdownConverter,
|
||||
ParsrConverter,
|
||||
@ -207,6 +208,13 @@ def test_docx_converter(samples_path):
|
||||
assert document.content.startswith("Sample Docx File")
|
||||
|
||||
|
||||
@pytest.mark.unit
def test_pptx_converter(samples_path):
    """The sample .pptx converts to a document whose content starts with "Sample Pptx File"."""
    sample_file = samples_path / "pptx" / "sample_pptx.pptx"
    documents = PptxConverter().convert(file_path=sample_file)
    first_document = documents[0]
    assert first_document.content.startswith("Sample Pptx File")
|
||||
|
||||
|
||||
@pytest.mark.unit
|
||||
def test_markdown_converter(samples_path):
|
||||
converter = MarkdownConverter()
|
||||
|
BIN
test/samples/pptx/sample_pptx.pptx
Normal file
BIN
test/samples/pptx/sample_pptx.pptx
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user