mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-15 20:21:14 +00:00
39 lines
1.1 KiB
Python
39 lines
1.1 KiB
Python
"""Read PDF files."""
|
|
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class PDFReader(BaseReader):
    """PDF reader that produces one ``Document`` per page."""

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Extract the text of every page of a PDF file.

        Args:
            file: Location of the PDF to read. A plain string path is
                accepted as well and is converted to a ``Path``.
            extra_info: Optional metadata merged into each page's metadata.
                Its entries are applied after the built-in ``page_label``
                and ``file_name`` keys, so they take precedence on a clash.

        Returns:
            One ``Document`` per page, in page order. Each document holds
            the page's extracted text and a metadata dict passed as
            ``extra_info``.
        """
        # Imported inside the method, so pypdf is only loaded when a PDF is
        # actually parsed.
        import pypdf

        # Callers sometimes pass a string; ``file.name`` below needs a Path.
        file = Path(file)

        docs = []
        # Extraction stays inside the ``with`` block: the PDF object reads
        # from the open file handle.
        with open(file, "rb") as fp:
            pdf = pypdf.PdfReader(fp)

            # Read the labels once instead of once per page; repeated
            # property access inside the loop is avoidable work.
            page_labels = pdf.page_labels

            for page, page_label in zip(pdf.pages, page_labels):
                metadata = {"page_label": page_label, "file_name": file.name}

                if extra_info is not None:
                    metadata.update(extra_info)

                docs.append(Document(page.extract_text(), extra_info=metadata))

        return docs
|