mirror of
https://github.com/deepset-ai/haystack.git
synced 2025-08-27 09:56:37 +00:00
feat: add microsoft pptx file converter (#6399)
* Create pptx.py * feat: pptx converter import __init__.py * feat: add pptx import __init__.py * feat: add python-pptx dependency * feat: add sample pptx for testing * feat: add pptx file-converter test * feat: release note pptx-file-converter-3e494d2747637eb2.yaml * feat: Update releasenotes/notes/pptx-file-converter-3e494d2747637eb2.yaml Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> * feat: refactor haystack/nodes/file_converter/pptx.py Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com> * fix imports --------- Co-authored-by: Stefano Fiorucci <44616784+anakin87@users.noreply.github.com>
This commit is contained in:
parent
604b177788
commit
c44e2cf49b
@ -7,6 +7,7 @@ from haystack.nodes.file_classifier import FileTypeClassifier
|
|||||||
from haystack.nodes.file_converter import (
|
from haystack.nodes.file_converter import (
|
||||||
BaseConverter,
|
BaseConverter,
|
||||||
DocxToTextConverter,
|
DocxToTextConverter,
|
||||||
|
PptxConverter,
|
||||||
ImageToTextConverter,
|
ImageToTextConverter,
|
||||||
MarkdownConverter,
|
MarkdownConverter,
|
||||||
PDFToTextConverter,
|
PDFToTextConverter,
|
||||||
|
@ -3,6 +3,7 @@ from haystack.nodes.file_converter.base import BaseConverter
|
|||||||
|
|
||||||
from haystack.nodes.file_converter.csv import CsvTextConverter
|
from haystack.nodes.file_converter.csv import CsvTextConverter
|
||||||
from haystack.nodes.file_converter.docx import DocxToTextConverter
|
from haystack.nodes.file_converter.docx import DocxToTextConverter
|
||||||
|
from haystack.nodes.file_converter.pptx import PptxConverter
|
||||||
from haystack.nodes.file_converter.json import JsonConverter
|
from haystack.nodes.file_converter.json import JsonConverter
|
||||||
from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
|
from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
|
||||||
from haystack.nodes.file_converter.txt import TextConverter
|
from haystack.nodes.file_converter.txt import TextConverter
|
||||||
|
85
haystack/nodes/file_converter/pptx.py
Normal file
85
haystack/nodes/file_converter/pptx.py
Normal file
@ -0,0 +1,85 @@
|
|||||||
|
from typing import List, Optional, Dict
|
||||||
|
from pathlib import Path
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from haystack.schema import Document
|
||||||
|
from haystack.lazy_imports import LazyImport
|
||||||
|
from haystack.nodes.file_converter.base import BaseConverter
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
with LazyImport("Run 'pip install python-pptx'") as pptx_import:
|
||||||
|
from pptx import Presentation
|
||||||
|
|
||||||
|
|
||||||
|
class PptxConverter(BaseConverter):
    """
    Convert Microsoft PowerPoint (.pptx) files into Haystack Documents.

    The text of every shape that exposes a ``text`` attribute is collected in slide order and
    joined with newlines into a single Document.
    """

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        progress_bar: bool = True,
    ):
        """
        :param remove_numeric_tables: Forwarded to the base converter. Not supported by this converter:
                                      `convert()` raises if it is enabled.
        :param valid_languages: Forwarded to the base converter. Language validation is not performed by
                                this converter; a non-empty value only triggers a warning in `convert()`.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the
                             document's attributes.
        :param progress_bar: Forwarded to the base converter.
        """
        # Check the lazy-import guard before doing anything else, so a missing
        # optional dependency is reported when the node is constructed.
        pptx_import.check()
        super().__init__(
            remove_numeric_tables=remove_numeric_tables,
            valid_languages=valid_languages,
            id_hash_keys=id_hash_keys,
            progress_bar=progress_bar,
        )

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Extract text from a .pptx file.

        Note: The text of all slides is concatenated (one line per text-bearing shape) and returned
        as a single Document; no per-slide Documents are created.

        :param file_path: Path to the .pptx file you want to convert.
        :param meta: Dictionary of meta data key-value pairs to attach to the returned document.
        :param remove_numeric_tables: Not supported by this converter. If enabled (here or in the
                                      constructor), an exception is raised.
        :param valid_languages: Not supported by this converter. If a non-empty value is set (here or
                                in the constructor), a warning is logged and the value is ignored.
        :param encoding: Not applicable.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :return: A list containing exactly one Document with the extracted text.
        :raises Exception: If `remove_numeric_tables` is enabled.
        """
        # Per-call arguments take precedence; fall back to the values given at construction time.
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if id_hash_keys is None:
            id_hash_keys = self.id_hash_keys

        if remove_numeric_tables:
            raise Exception("'remove_numeric_tables' is not supported by PptxConverter.")
        if valid_languages:
            # `valid_languages` is a list of language codes, so the former `is True` check could never
            # fire and the setting was silently ignored. Warn instead of raising so that existing
            # configurations which pass a list keep working.
            logger.warning(
                "Language validation using 'valid_languages' is not supported by PptxConverter. "
                "The provided value %s is ignored.",
                valid_languages,
            )

        presentation = Presentation(file_path)
        text_parts: List[str] = []
        for slide in presentation.slides:
            # Only shapes exposing a `text` attribute contribute to the output; others are skipped.
            text_parts.extend(shape.text for shape in slide.shapes if hasattr(shape, "text"))
        text = "\n".join(text_parts)

        document = Document(content=text, meta=meta, id_hash_keys=id_hash_keys)
        return [document]
|
@ -165,6 +165,7 @@ preprocessing = [
|
|||||||
file-conversion = [
|
file-conversion = [
|
||||||
"azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table extractor)
|
"azure-ai-formrecognizer>=3.2.0b2", # Microsoft Azure's Form Recognizer service (text and table extractor)
|
||||||
"python-docx",
|
"python-docx",
|
||||||
|
"python-pptx",
|
||||||
"tika", # Apache Tika (text & metadata extractor)
|
"tika", # Apache Tika (text & metadata extractor)
|
||||||
"beautifulsoup4",
|
"beautifulsoup4",
|
||||||
"markdown",
|
"markdown",
|
||||||
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
features:
|
||||||
|
- |
|
||||||
|
Add PptxConverter: a node to convert pptx files to Haystack Documents.
|
@ -16,6 +16,7 @@ from haystack.nodes import (
|
|||||||
AzureConverter,
|
AzureConverter,
|
||||||
CsvTextConverter,
|
CsvTextConverter,
|
||||||
DocxToTextConverter,
|
DocxToTextConverter,
|
||||||
|
PptxConverter,
|
||||||
JsonConverter,
|
JsonConverter,
|
||||||
MarkdownConverter,
|
MarkdownConverter,
|
||||||
ParsrConverter,
|
ParsrConverter,
|
||||||
@ -207,6 +208,13 @@ def test_docx_converter(samples_path):
|
|||||||
assert document.content.startswith("Sample Docx File")
|
assert document.content.startswith("Sample Docx File")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.unit
def test_pptx_converter(samples_path):
    """The sample presentation is converted and its text starts with the expected title."""
    pptx_file = samples_path / "pptx" / "sample_pptx.pptx"
    documents = PptxConverter().convert(file_path=pptx_file)
    first_document = documents[0]
    assert first_document.content.startswith("Sample Pptx File")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.unit
|
@pytest.mark.unit
|
||||||
def test_markdown_converter(samples_path):
|
def test_markdown_converter(samples_path):
|
||||||
converter = MarkdownConverter()
|
converter = MarkdownConverter()
|
||||||
|
BIN
test/samples/pptx/sample_pptx.pptx
Normal file
BIN
test/samples/pptx/sample_pptx.pptx
Normal file
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user