llama-hub/loader_hub/file/pptx/base.py

"""Read Microsoft PowerPoint files."""

import os
from pathlib import Path
from typing import Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class PptxReader(BaseReader):
    """Powerpoint reader.

    Extract the text of every slide and, optionally, caption the images
    embedded in the slides with the ``nlpconnect/vit-gpt2-image-captioning``
    model.

    """

    def __init__(self, caption_images: Optional[bool] = False) -> None:
        """Init reader.

        Args:
            caption_images: When truthy, load the captioning model, feature
                extractor and tokenizer so that images found in slides are
                described in the extracted text. When falsy, nothing is
                loaded and images are ignored.
        """
        self.caption_images = caption_images
        # Always define the attribute so it can be inspected safely even when
        # captioning is disabled.
        self.parser_config: Dict = {}
        if caption_images:
            from transformers import (AutoTokenizer, VisionEncoderDecoderModel,
                                      ViTFeatureExtractor)

            checkpoint = "nlpconnect/vit-gpt2-image-captioning"
            model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
            feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)

            self.parser_config = {
                "feature_extractor": feature_extractor,
                "model": model,
                "tokenizer": tokenizer,
            }

    def generate_image_caption(self, tmp_image_file: str) -> str:
        """Generate text caption of image.

        Args:
            tmp_image_file: Path of the image file to describe.

        Returns:
            The stripped caption produced by the model, or an empty string
            when the reader was created without image captioning.
        """
        if not self.caption_images:
            return ""

        import torch
        from PIL import Image

        model = self.parser_config["model"]
        feature_extractor = self.parser_config["feature_extractor"]
        tokenizer = self.parser_config["tokenizer"]

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

        gen_kwargs = {"max_length": 16, "num_beams": 4}

        # Keep the image open only while its pixels are being extracted so the
        # underlying file handle is always released, even on failure. This
        # also lets the caller delete the file afterwards.
        with Image.open(tmp_image_file) as i_image:
            rgb_image = (
                i_image if i_image.mode == "RGB" else i_image.convert(mode="RGB")
            )
            pixel_values = feature_extractor(
                images=[rgb_image], return_tensors="pt"
            ).pixel_values
        pixel_values = pixel_values.to(device)

        output_ids = model.generate(pixel_values, **gen_kwargs)

        preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return preds[0].strip()

    @staticmethod
    def _embedded_image(shape):
        """Return the image embedded in ``shape``, or ``None`` if it has none."""
        try:
            return shape.image
        except (AttributeError, ValueError):
            # AttributeError: the shape is not a picture at all.
            # ValueError: python-pptx may raise this for a picture without
            # embedded image data (e.g. a linked picture); skip it instead of
            # aborting the whole load.
            return None

    def _caption_image(self, image) -> str:
        """Caption ``image`` via a uniquely named temp file that is always removed."""
        import tempfile

        # delete=False so the file can be reopened by name on every platform;
        # it is removed explicitly in the ``finally`` below.
        tmp = tempfile.NamedTemporaryFile(suffix=f".{image.ext}", delete=False)
        try:
            with tmp:
                tmp.write(image.blob)
            return self.generate_image_caption(tmp.name)
        finally:
            os.remove(tmp.name)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
    ) -> List[Document]:
        """Parse file.

        Args:
            file: Path of the PowerPoint file to read.
            extra_info: Optional metadata passed through to the returned
                document unchanged.

        Returns:
            A single-element list holding one ``Document`` with the text of
            all slides. Each slide starts with a ``Slide #<index>`` header
            (zero-based), followed by image captions (when enabled) and the
            text of each shape.
        """
        from pptx import Presentation

        presentation = Presentation(file)
        # Collect fragments and join once instead of repeated string +=.
        parts: List[str] = []
        for i, slide in enumerate(presentation.slides):
            parts.append(f"\n\nSlide #{i}: \n")
            for shape in slide.shapes:
                if self.caption_images:
                    image = self._embedded_image(shape)
                    if image is not None:
                        parts.append(f"\n Image: {self._caption_image(image)}\n\n")
                if hasattr(shape, "text"):
                    parts.append(f"{shape.text}\n")

        return [Document("".join(parts), extra_info=extra_info)]