mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-15 12:11:50 +00:00
84 lines
2.6 KiB
Python
84 lines
2.6 KiB
Python
![]() |
"""Read PDF files."""
|
||
|
|
||
|
from pathlib import Path
|
||
|
from typing import Dict, List, Any, Optional
|
||
|
|
||
|
from gpt_index.readers.base import BaseReader
|
||
|
from gpt_index.readers.schema.base import Document
|
||
|
|
||
|
|
||
|
class CJKPDFReader(BaseReader):
    """CJK PDF reader.

    Extract text from PDF including CJK (Chinese, Japanese and Korean)
    languages using pdfminer.six.

    Args:
        concat_pages (bool): whether to concatenate all pages into one document.
            If set to False, a Document will be created for each page.
            True by default.
    """

    def __init__(
        self,
        *args: Any,
        concat_pages: bool = True,
        **kwargs: Any
    ) -> None:
        """Init params.

        Args:
            *args: positional arguments forwarded unchanged to the base class.
            concat_pages (bool): keyword-only; see the class docstring.
            **kwargs: keyword arguments forwarded unchanged to the base class.
        """
        super().__init__(*args, **kwargs)
        self._concat_pages = concat_pages

    def _extract_text_by_page(self, pdf_path: Path) -> List[str]:
        """Extract the text of every page of a PDF file.

        Args:
            pdf_path (Path): path of the PDF file; it is opened in binary mode.

        Returns:
            List[str]: one string per page, in the order the pages are
            yielded by ``PDFPage.get_pages``. Empty if no page is yielded.
        """
        # pdfminer is imported inside the method, so it is only needed once
        # text extraction is actually requested.
        from io import StringIO

        from pdfminer.converter import TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
        from pdfminer.pdfpage import PDFPage

        rsrcmgr = PDFResourceManager()
        # A single in-memory buffer receives the converter output; it is read
        # and reset after each page so that pages stay separated.
        retstr = StringIO()
        device = TextConverter(
            rsrcmgr, retstr, codec="utf-8", laparams=LAParams()
        )
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        text_list: List[str] = []
        try:
            # The context manager closes the file even when a page fails to
            # parse; previously an exception here leaked the handle.
            with open(pdf_path, "rb") as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
                    text_list.append(retstr.getvalue())
                    # Empty the buffer and rewind, otherwise the next page's
                    # text would be appended after this page's text.
                    retstr.truncate(0)
                    retstr.seek(0)
        finally:
            # Always release the converter, on success and on error alike.
            device.close()

        return text_list

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file.

        Args:
            file (Path): path of the PDF file to read.
            extra_info (Optional[Dict]): passed as ``extra_info`` to every
                Document that is created.

        Returns:
            List[Document]: a single Document holding all pages joined with
            a newline when ``concat_pages`` is True, otherwise one Document
            per page.
        """
        text_list = self._extract_text_by_page(file)

        if self._concat_pages:
            return [Document("\n".join(text_list), extra_info=extra_info)]
        return [Document(text, extra_info=extra_info) for text in text_list]
|
||
|
|
||
|
|
||
|
|