diff --git a/loader_hub/papers/arxiv/README.md b/loader_hub/papers/arxiv/README.md
index 739536c0..2d85ecb7 100644
--- a/loader_hub/papers/arxiv/README.md
+++ b/loader_hub/papers/arxiv/README.md
@@ -15,4 +15,15 @@ loader = ArxivReader()
 documents = loader.load_data(search_query='au:Karpathy')
 ```
 
+Alternatively, if you would like to load papers and abstracts separately:
+
+```python
+from llama_index import download_loader
+
+ArxivReader = download_loader("ArxivReader")
+
+loader = ArxivReader()
+documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')
+```
+
 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
diff --git a/loader_hub/papers/arxiv/base.py b/loader_hub/papers/arxiv/base.py
index c21d1bdb..b8cfc2b3 100644
--- a/loader_hub/papers/arxiv/base.py
+++ b/loader_hub/papers/arxiv/base.py
@@ -2,7 +2,7 @@
 import hashlib
 import logging
 import os
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 from llama_index import download_loader
 from llama_index.readers.base import BaseReader
@@ -93,3 +93,74 @@ class ArxivReader(BaseReader):
             print("Unable to delete files or directory")
 
         return arxiv_documents + abstract_documents
+
+
+    def load_papers_and_abstracts(
+        self,
+        search_query: str,
+        papers_dir: Optional[str] = ".papers",
+        max_results: Optional[int] = 10,
+    ) -> Tuple[List[Document], List[Document]]:
+        """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.
+
+        Args:
+            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
+            papers_dir (Optional[str]): Local directory to store the papers.
+            max_results (Optional[int]): Maximum number of papers to fetch.
+
+        Returns:
+            List[Document]: A list of Document objects representing the papers themselves.
+            List[Document]: A list of Document objects representing the abstracts only.
+        """
+        import arxiv
+
+        arxiv_search = arxiv.Search(
+            query=search_query,
+            id_list=[],
+            max_results=max_results,
+            sort_by=arxiv.SortCriterion.Relevance,
+        )
+        search_results = list(arxiv_search.results())
+        logging.debug(f"> Successfully fetched {len(search_results)} papers")
+
+        if not os.path.exists(papers_dir):
+            os.makedirs(papers_dir)
+
+        paper_lookup = {}
+        for paper in search_results:
+            # Hash filename to avoid bad characters in file path
+            filename = f"{self._hacky_hash(paper.title)}.pdf"
+            paper_lookup[os.path.join(papers_dir, filename)] = {
+                "Title of this paper": paper.title,
+                "Authors": (", ").join([a.name for a in paper.authors]),
+                "Date published": paper.published.strftime("%m/%d/%Y"),
+                "URL": paper.entry_id,
+                # "summary": paper.summary
+            }
+            paper.download_pdf(dirpath=papers_dir, filename=filename)
+            logging.debug(f"> Downloading {filename}...")
+
+        def get_paper_metadata(filename):
+            return paper_lookup[filename]
+
+        SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+        arxiv_documents = SimpleDirectoryReader(
+            papers_dir, file_metadata=get_paper_metadata
+        ).load_data()
+        # Include extra documents containing the abstracts
+        abstract_documents = []
+        for paper in search_results:
+            d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}"
+            abstract_documents.append(Document(d))
+
+        # Delete downloaded papers
+        try:
+            for f in os.listdir(papers_dir):
+                os.remove(os.path.join(papers_dir, f))
+                logging.debug(f"> Deleted file: {f}")
+            os.rmdir(papers_dir)
+            logging.debug(f"> Deleted directory: {papers_dir}")
+        except OSError:
+            print("Unable to delete files or directory")
+
+        return arxiv_documents, abstract_documents
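
A quick way to sanity-check the new return shape is to call the method and compare the two lists side by side. This is a minimal sketch of the call pattern introduced by this change, assuming the loader is fetched with `download_loader` exactly as in the README; the `max_results=5` argument and the `print` call are illustrative only and are not part of this diff.

```python
from llama_index import download_loader

# Fetch the ArxivReader loader at runtime, as shown in the README snippet above.
ArxivReader = download_loader("ArxivReader")
loader = ArxivReader()

# New in this change: full papers and abstracts come back as two separate
# lists of Document objects instead of a single concatenated list.
papers, abstracts = loader.load_papers_and_abstracts(
    search_query="au:Karpathy", max_results=5
)

print(f"Loaded {len(papers)} paper documents and {len(abstracts)} abstract documents")
```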