Merge pull request #97 from reletreby/main
commit 94e5baae7a
@@ -15,4 +15,15 @@ loader = ArxivReader()
 documents = loader.load_data(search_query='au:Karpathy')
 ```
+
+Alternatively, if you would like to load papers and abstracts separately:
+
+```python
+from llama_index import download_loader
+
+ArxivReader = download_loader("ArxivReader")
+
+loader = ArxivReader()
+documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')
+```
 
 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
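The README snippet above hands back two lists: full-paper documents and lightweight abstract documents. A minimal sketch of one way to consume them (illustrative only, not part of this commit; it assumes a llama_index release from this era that exposes `GPTSimpleVectorIndex` and accepts a document list directly, whereas newer releases use `GPTSimpleVectorIndex.from_documents`):

```python
from llama_index import GPTSimpleVectorIndex, download_loader

ArxivReader = download_loader("ArxivReader")
loader = ArxivReader()
documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')

# Index only the short abstracts for cheap retrieval; keep the full-paper
# documents around for follow-up reading or a second, larger index.
abstract_index = GPTSimpleVectorIndex(abstracts)
response = abstract_index.query("Which papers cover training recipes for neural networks?")
print(response)
```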
@@ -2,7 +2,7 @@
 import hashlib
 import logging
 import os
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 from llama_index import download_loader
 from llama_index.readers.base import BaseReader
@@ -93,3 +93,74 @@ class ArxivReader(BaseReader):
             print("Unable to delete files or directory")
 
         return arxiv_documents + abstract_documents
+
+
+    def load_papers_and_abstracts(
+        self,
+        search_query: str,
+        papers_dir: Optional[str] = ".papers",
+        max_results: Optional[int] = 10,
+    ) -> Tuple[List[Document], List[Document]]:
+        """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.
+
+        Args:
+            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
+            papers_dir (Optional[str]): Local directory to store the papers.
+            max_results (Optional[int]): Maximum number of papers to fetch.
+
+        Returns:
+            List[Document]: A list of Document objects representing the papers themselves.
+            List[Document]: A list of Document objects representing the abstracts only.
+        """
+        import arxiv
+
+        arxiv_search = arxiv.Search(
+            query=search_query,
+            id_list=[],
+            max_results=max_results,
+            sort_by=arxiv.SortCriterion.Relevance,
+        )
+        search_results = list(arxiv_search.results())
+        logging.debug(f"> Successfully fetched {len(search_results)} papers")
+
+        if not os.path.exists(papers_dir):
+            os.makedirs(papers_dir)
+
+        paper_lookup = {}
+        for paper in search_results:
+            # Hash filename to avoid bad characters in file path
+            filename = f"{self._hacky_hash(paper.title)}.pdf"
+            paper_lookup[os.path.join(papers_dir, filename)] = {
+                "Title of this paper": paper.title,
+                "Authors": (", ").join([a.name for a in paper.authors]),
+                "Date published": paper.published.strftime("%m/%d/%Y"),
+                "URL": paper.entry_id,
+                # "summary": paper.summary
+            }
+            paper.download_pdf(dirpath=papers_dir, filename=filename)
+            logging.debug(f"> Downloading {filename}...")
+
+        def get_paper_metadata(filename):
+            return paper_lookup[filename]
+
+        SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+        arxiv_documents = SimpleDirectoryReader(
+            papers_dir, file_metadata=get_paper_metadata
+        ).load_data()
+        # Include extra documents containing the abstracts
+        abstract_documents = []
+        for paper in search_results:
+            d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}"
+            abstract_documents.append(Document(d))
+
+        # Delete downloaded papers
+        try:
+            for f in os.listdir(papers_dir):
+                os.remove(os.path.join(papers_dir, f))
+                logging.debug(f"> Deleted file: {f}")
+            os.rmdir(papers_dir)
+            logging.debug(f"> Deleted directory: {papers_dir}")
+        except OSError:
+            print("Unable to delete files or directory")
+
+        return arxiv_documents, abstract_documents
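Worth noting in the method above: the PDFs are saved under hashed filenames, and `get_paper_metadata` maps each hashed path back to its human-readable entry in `paper_lookup` through SimpleDirectoryReader's `file_metadata` callback. A hedged sketch of inspecting the result (illustrative only; it assumes the loader's `arxiv` dependency is installed and that this llama_index version attaches the per-file metadata to `Document.extra_info`):

```python
from llama_index import download_loader

ArxivReader = download_loader("ArxivReader")
loader = ArxivReader()

papers, abstracts = loader.load_papers_and_abstracts(
    search_query="au:Karpathy", max_results=3
)

# Paper documents should carry the metadata dict built in paper_lookup
# (title, authors, publication date, URL); abstracts hold only summary text.
for doc in papers:
    print(doc.extra_info)
for doc in abstracts:
    print(doc.get_text()[:120], "...")
```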