mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-12-09 22:17:55 +00:00
updated extra_info dict and readme file (#250)
This commit is contained in:
parent
a3255b4e0a
commit
fc6a8c04f4
@ -8,12 +8,12 @@ To use this loader, you need to pass file path of the local file as string or `P
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from gpt_index import download_loader
|
||||
from llama_index import download_loader
|
||||
|
||||
PyMuPDFReader = download_loader("PyMuPDFReader")
|
||||
|
||||
loader = PyMuPDFReader()
|
||||
documents = loader.load(file=Path('./article.pdf'), metadata=True)
|
||||
documents = loader.load(file_path=Path('./article.pdf'), metadata=True)
|
||||
```
|
||||
|
||||
This loader is designed to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or to be used subsequently as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
|
||||
|
||||
@ -45,24 +45,19 @@ class PyMuPDFReader(BaseReader):
|
||||
|
||||
# if metadata is True, add metadata to each document
|
||||
if metadata:
|
||||
metadata_dict = {}
|
||||
metadata_dict["total_pages"] = len(doc)
|
||||
metadata_dict["file_path"] = self.file_path
|
||||
|
||||
# add extra_info to metadata_dict
|
||||
if not extra_info:
|
||||
extra_info = metadata_dict
|
||||
else:
|
||||
extra_info = dict(extra_info, **metadata_dict)
|
||||
extra_info = {}
|
||||
extra_info["total_pages"] = len(doc)
|
||||
extra_info["file_path"] = file_path
|
||||
|
||||
# return list of documents
|
||||
return [
|
||||
Document(
|
||||
page.get_text().encode("utf-8"),
|
||||
text=page.get_text().encode("utf-8"),
|
||||
extra_info=dict(
|
||||
extra_info,
|
||||
**{
|
||||
metadata_dict["source"]: f"{page.number+1}",
|
||||
"source": f"{page.number+1}",
|
||||
},
|
||||
),
|
||||
)
|
||||
@ -71,6 +66,6 @@ class PyMuPDFReader(BaseReader):
|
||||
|
||||
else:
|
||||
return [
|
||||
Document(page.get_text().encode("utf-8"), extra_info=extra_info)
|
||||
Document(text=page.get_text().encode("utf-8"), extra_info=extra_info)
|
||||
for page in doc
|
||||
]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user