84 lines
2.6 KiB
Python
Raw Normal View History

"""Read PDF files."""
from pathlib import Path
from typing import Dict, List, Any, Optional
from gpt_index.readers.base import BaseReader
from gpt_index.readers.schema.base import Document
class CJKPDFReader(BaseReader):
    """CJK PDF reader.

    Extract text from PDF including CJK (Chinese, Japanese and Korean)
    languages using pdfminer.six.

    Args:
        concat_pages (bool): whether to concatenate all pages into one
            document. If set to False, a Document will be created for each
            page. True by default.
    """

    def __init__(
        self,
        *args: Any,
        concat_pages: bool = True,
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_pages = concat_pages

    def _extract_text_by_page(self, pdf_path: Path) -> List[str]:
        """Extract the text of every page of a PDF file.

        Args:
            pdf_path (Path): path of the PDF file to read.

        Returns:
            List[str]: one string per page, in the order the pages are
                yielded by ``PDFPage.get_pages``.
        """
        # pdfminer is imported lazily so that it is only required when a PDF
        # is actually parsed, not when this module is imported.
        from io import StringIO

        from pdfminer.converter import TextConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
        from pdfminer.pdfpage import PDFPage

        resource_manager = PDFResourceManager()
        # The converter writes the text of each processed page into this
        # in-memory buffer.
        buffer = StringIO()
        device = TextConverter(
            resource_manager, buffer, codec="utf-8", laparams=LAParams()
        )

        page_texts: List[str] = []
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            # The context manager guarantees the file handle is released even
            # if pdfminer raises while parsing a page.
            with open(pdf_path, "rb") as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
                    page_texts.append(buffer.getvalue())
                    # Reset the shared buffer so the next entry holds only
                    # the text of the next page.
                    buffer.truncate(0)
                    buffer.seek(0)
        finally:
            # Always close the converter device, including on failure.
            device.close()

        return page_texts

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file.

        Args:
            file (Path): path of the PDF file to load.
            extra_info (Optional[Dict]): passed unchanged as ``extra_info``
                to every Document that is created.

        Returns:
            List[Document]: a single Document holding all page texts joined
                with newlines when ``concat_pages`` is True, otherwise one
                Document per page.
        """
        text_list = self._extract_text_by_page(file)

        if self._concat_pages:
            return [Document("\n".join(text_list), extra_info=extra_info)]

        return [Document(text, extra_info=extra_info) for text in text_list]