mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-11-09 14:24:08 +00:00
Merge pull request #97 from reletreby/main
This commit is contained in:
commit
94e5baae7a
@ -15,4 +15,15 @@ loader = ArxivReader()
|
||||
documents = loader.load_data(search_query='au:Karpathy')
```

Alternatively, if you would like to load papers and abstracts separately:

```python
from llama_index import download_loader

ArxivReader = download_loader("ArxivReader")

loader = ArxivReader()
documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
||||
@ -2,7 +2,7 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Optional
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from llama_index import download_loader
|
||||
from llama_index.readers.base import BaseReader
|
||||
@ -93,3 +93,74 @@ class ArxivReader(BaseReader):
|
||||
print("Unable to delete files or directory")
|
||||
|
||||
return arxiv_documents + abstract_documents
|
||||
|
||||
|
||||
def load_papers_and_abstracts(
    self,
    search_query: str,
    papers_dir: Optional[str] = ".papers",
    max_results: Optional[int] = 10,
) -> Tuple[List[Document], List[Document]]:
    """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.

    Args:
        search_query (str): A topic to search for (e.g. "Artificial Intelligence").
        papers_dir (Optional[str]): Local directory to store the papers in.
        max_results (Optional[int]): Maximum number of papers to fetch.

    Returns:
        Tuple[List[Document], List[Document]]: A tuple of
            (documents built from the full papers,
             documents built from the abstracts only).
    """
    # Imported lazily so the loader can be constructed without arxiv installed.
    import arxiv

    arxiv_search = arxiv.Search(
        query=search_query,
        id_list=[],
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    search_results = list(arxiv_search.results())
    logging.debug(f"> Successfully fetched {len(search_results)} papers")

    if not os.path.exists(papers_dir):
        os.makedirs(papers_dir)

    # Map on-disk path -> metadata dict, consumed by SimpleDirectoryReader below.
    paper_lookup = {}
    for paper in search_results:
        # Hash the title to avoid bad characters in the file path.
        filename = f"{self._hacky_hash(paper.title)}.pdf"
        paper_lookup[os.path.join(papers_dir, filename)] = {
            "Title of this paper": paper.title,
            "Authors": (", ").join([a.name for a in paper.authors]),
            "Date published": paper.published.strftime("%m/%d/%Y"),
            "URL": paper.entry_id,
        }
        paper.download_pdf(dirpath=papers_dir, filename=filename)
        # Fix: the original f-string had no placeholder and logged a constant
        # "(unknown)" instead of the file actually being downloaded.
        logging.debug(f"> Downloading {filename}...")

    def get_paper_metadata(filename):
        # file_metadata callback: filename is the full path used as key above.
        return paper_lookup[filename]

    SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
    arxiv_documents = SimpleDirectoryReader(
        papers_dir, file_metadata=get_paper_metadata
    ).load_data()
    # Include extra documents containing the abstracts only.
    abstract_documents = []
    for paper in search_results:
        d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}"
        abstract_documents.append(Document(d))

    # Delete the downloaded papers; cleanup is best-effort, so failures
    # are reported but do not abort the load.
    try:
        for f in os.listdir(papers_dir):
            os.remove(os.path.join(papers_dir, f))
            logging.debug(f"> Deleted file: {f}")
        os.rmdir(papers_dir)
        logging.debug(f"> Deleted directory: {papers_dir}")
    except OSError:
        print("Unable to delete files or directory")

    return arxiv_documents, abstract_documents
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user