40 lines
1.2 KiB
Python
Raw Normal View History

2023-04-27 01:26:26 -07:00
"""Deepdoctection Data Reader."""
from pathlib import Path
from typing import Optional, Set
from typing import Dict, List
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class DeepDoctectionReader(BaseReader):
    """Deepdoctection reader for PDFs.

    Uses deepdoctection as a library to parse PDF files and emits one
    ``Document`` for every page the analyzer yields.
    """

    def __init__(self, attrs_as_metadata: Optional[Set] = None) -> None:
        """Build the deepdoctection analyzer and store the metadata selection.

        Args:
            attrs_as_metadata: Names of page attributes that should be copied
                into each document's ``extra_info``. ``None`` (the default)
                or an empty set means no page attributes are copied.
        """
        # Imported here rather than at module level, so importing this module
        # does not itself import deepdoctection.
        import deepdoctection as dd

        self.analyzer = dd.get_dd_analyzer()
        self.attrs_as_metadata = attrs_as_metadata or set()

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a file into one ``Document`` per analyzed page.

        Args:
            file: Path of the file to analyze; passed to the analyzer as a
                string.
            extra_info: Optional metadata attached to every returned
                document. The dict is copied, never mutated. Page attributes
                selected via ``attrs_as_metadata`` are layered on top and win
                on key collisions.

        Returns:
            The documents in the order the analyzer yielded their pages. Each
            document holds the page's ``text`` and its own metadata dict.
        """
        # Snapshot the caller's metadata once; previously this argument was
        # overwritten inside the loop and therefore silently ignored.
        base_info = dict(extra_info) if extra_info else {}

        df = self.analyzer.analyze(path=str(file))
        # The dataflow is reset before it is iterated.
        df.reset_state()

        result_docs = []
        for page in df:
            # Fresh dict per page so documents never share mutable metadata.
            page_info = dict(base_info)
            page_info.update(
                {
                    k: getattr(page, k)
                    for k in self.attrs_as_metadata
                    if hasattr(page, k)
                }
            )
            result_docs.append(Document(page.text, extra_info=page_info))
        return result_docs