updated extra_info dict and readme file (#250)

This commit is contained in:
Arun Brahma 2023-05-11 00:46:29 +05:30 committed by GitHub
parent a3255b4e0a
commit fc6a8c04f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 8 additions and 13 deletions

View File

@ -8,12 +8,12 @@ To use this loader, you need to pass file path of the local file as string or `P
```python
from pathlib import Path
from gpt_index import download_loader
from llama_index import download_loader
PyMuPDFReader = download_loader("PyMuPDFReader")
loader = PyMuPDFReader()
documents = loader.load(file=Path('./article.pdf'), metadata=True)
documents = loader.load(file_path=Path('./article.pdf'), metadata=True)
```
This loader is designed to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or to be used subsequently as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.

View File

@ -45,24 +45,19 @@ class PyMuPDFReader(BaseReader):
# if metadata is True, add metadata to each document
if metadata:
metadata_dict = {}
metadata_dict["total_pages"] = len(doc)
metadata_dict["file_path"] = self.file_path
# add extra_info to metadata_dict
if not extra_info:
extra_info = metadata_dict
else:
extra_info = dict(extra_info, **metadata_dict)
extra_info = {}
extra_info["total_pages"] = len(doc)
extra_info["file_path"] = file_path
# return list of documents
return [
Document(
page.get_text().encode("utf-8"),
text=page.get_text().encode("utf-8"),
extra_info=dict(
extra_info,
**{
metadata_dict["source"]: f"{page.number+1}",
"source": f"{page.number+1}",
},
),
)
@ -71,6 +66,6 @@ class PyMuPDFReader(BaseReader):
else:
return [
Document(page.get_text().encode("utf-8"), extra_info=extra_info)
Document(text=page.get_text().encode("utf-8"), extra_info=extra_info)
for page in doc
]