mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-14 03:31:41 +00:00

Fixed the bug caused by the ImageReader output format being changed in this commit: a37df8c221 (diff-d4cdba126dd16c86090107c1c66621a520d20d8962f7f4dd461146cedbb1135d)
77 lines
2.8 KiB
Python
77 lines
2.8 KiB
Python
"""Simple reader that reads flattened PDFs."""
|
|
import os
|
|
import pathlib
|
|
import warnings
|
|
from pathlib import Path
|
|
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class FlatPdfReader(BaseReader):
    """Reader that extracts text from a PDF by rendering its pages to images.

    Each page is rendered to a PNG file, every PNG is passed to
    ``image_loader``, and the text of the per-page results is concatenated
    into a single Document.
    """

    image_loader: BaseReader

    def __init__(self, image_loader: BaseReader):
        """
        Store the reader used to process the rendered page images.

        :param image_loader: BaseReader: Reader whose ``load_data(file=...)``
            is called once per rendered page image
        """
        self.image_loader = image_loader

    def load_data(self, file: Path) -> Document:
        """
        Render every page of a PDF to an image and collect the text that
        ``image_loader`` returns for each page.

        Failures are reported with ``warnings.warn`` instead of being raised;
        in that case the method returns ``None``.

        :param file: Path: The PDF file to load
        :return: A Document holding the concatenated page texts, or ``None``
            when the file is invalid or processing failed
        """
        import tempfile

        try:
            # Reject the input when EITHER check fails. The suffix comparison
            # is case-insensitive so that "report.PDF" is still accepted.
            if not file.is_file() or file.suffix.lower() != ".pdf":
                raise ValueError("Invalid file")

            # A private temporary directory avoids wiping an unrelated
            # "flat_pdf" folder in the current working directory and keeps
            # parallel runs from deleting each other's page images. It is
            # removed automatically when the ``with`` block exits.
            with tempfile.TemporaryDirectory(prefix="flat_pdf_") as work_dir:
                pdf_pages_count: int = self.convert_pdf_in_images(
                    pdf_dir=file, work_dir=work_dir
                )

                page_texts = []
                for page_number in range(pdf_pages_count):
                    documents = self.image_loader.load_data(
                        file=Path(work_dir) / f"page-{page_number}.png"
                    )
                    # The image loader returns a sequence of documents; only
                    # the first one is used for each page.
                    page_texts.append(documents[0].text)

            return Document(text="".join(page_texts))

        except Exception as e:
            # Best-effort contract: warn and hand back None rather than raise.
            warnings.warn(str(e))
            return None

    def convert_pdf_in_images(self, pdf_dir: Path, work_dir: str) -> int:
        """
        Render each page of a PDF file to ``<work_dir>/page-<number>.png``.

        :param pdf_dir: Path: Path of the PDF file to be converted
        :param work_dir: str: Existing directory where the images are saved
        :return: The number of pages in the PDF file
        """
        import fitz

        zoom_x = 2.0  # horizontal zoom
        zoom_y = 2.0  # vertical zoom
        mat = fitz.Matrix(zoom_x, zoom_y)

        # Open the document as a context manager so it is closed even when
        # rendering or saving a page fails.
        with fitz.open(pdf_dir) as pages:
            for page in pages:  # iterate through the pages
                image = page.get_pixmap(matrix=mat)  # render page to an image
                image.save(f"{work_dir}/page-{page.number}.png")
            return pages.page_count
|