Merge pull request #97 from reletreby/main
commit 94e5baae7a
@@ -15,4 +15,15 @@ loader = ArxivReader()
 documents = loader.load_data(search_query='au:Karpathy')
 ```
+
+Alternatively, if you would like to load papers and abstracts separately:
+
+```python
+from llama_index import download_loader
+
+ArxivReader = download_loader("ArxivReader")
+
+loader = ArxivReader()
+documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')
+```
 
 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
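The README snippet above hands back two lists: full-paper documents and lightweight abstract documents. A minimal sketch of one way to consume them (illustrative only, not part of this commit; it assumes a llama_index release from this era that exposes `GPTSimpleVectorIndex` and accepts a document list directly, whereas newer releases use `GPTSimpleVectorIndex.from_documents`):

```python
from llama_index import GPTSimpleVectorIndex, download_loader

ArxivReader = download_loader("ArxivReader")
loader = ArxivReader()
documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')

# Index only the short abstracts for cheap retrieval; keep the full-paper
# documents around for follow-up reading or a second, larger index.
abstract_index = GPTSimpleVectorIndex(abstracts)
response = abstract_index.query("Which papers cover training recipes for neural networks?")
print(response)
```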
@@ -2,7 +2,7 @@
 import hashlib
 import logging
 import os
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 from llama_index import download_loader
 from llama_index.readers.base import BaseReader
@@ -93,3 +93,74 @@ class ArxivReader(BaseReader):
             print("Unable to delete files or directory")
 
         return arxiv_documents + abstract_documents
+
+
+    def load_papers_and_abstracts(
+        self,
+        search_query: str,
+        papers_dir: Optional[str] = ".papers",
+        max_results: Optional[int] = 10,
+    ) -> Tuple[List[Document], List[Document]]:
+        """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.
+
+        Args:
+            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
+            papers_dir (Optional[str]): Local directory to store the papers.
+            max_results (Optional[int]): Maximum number of papers to fetch.
+
+        Returns:
+            List[Document]: A list of Document objects representing the papers themselves.
+            List[Document]: A list of Document objects representing the abstracts only.
+        """
+        import arxiv
+
+        arxiv_search = arxiv.Search(
+            query=search_query,
+            id_list=[],
+            max_results=max_results,
+            sort_by=arxiv.SortCriterion.Relevance,
+        )
+        search_results = list(arxiv_search.results())
+        logging.debug(f"> Successfully fetched {len(search_results)} papers")
+
+        if not os.path.exists(papers_dir):
+            os.makedirs(papers_dir)
+
+        paper_lookup = {}
+        for paper in search_results:
+            # Hash filename to avoid bad characters in file path
+            filename = f"{self._hacky_hash(paper.title)}.pdf"
+            paper_lookup[os.path.join(papers_dir, filename)] = {
+                "Title of this paper": paper.title,
+                "Authors": (", ").join([a.name for a in paper.authors]),
+                "Date published": paper.published.strftime("%m/%d/%Y"),
+                "URL": paper.entry_id,
+                # "summary": paper.summary
+            }
+            paper.download_pdf(dirpath=papers_dir, filename=filename)
+            logging.debug(f"> Downloading {filename}...")
+
+        def get_paper_metadata(filename):
+            return paper_lookup[filename]
+
+        SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+        arxiv_documents = SimpleDirectoryReader(
+            papers_dir, file_metadata=get_paper_metadata
+        ).load_data()
+        # Include extra documents containing the abstracts
+        abstract_documents = []
+        for paper in search_results:
+            d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}"
+            abstract_documents.append(Document(d))
+
+        # Delete downloaded papers
+        try:
+            for f in os.listdir(papers_dir):
+                os.remove(os.path.join(papers_dir, f))
+                logging.debug(f"> Deleted file: {f}")
+            os.rmdir(papers_dir)
+            logging.debug(f"> Deleted directory: {papers_dir}")
+        except OSError:
+            print("Unable to delete files or directory")
+
+        return arxiv_documents, abstract_documents
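Worth noting in the method above: the PDFs are saved under hashed filenames, and `get_paper_metadata` maps each hashed path back to its human-readable entry in `paper_lookup` through SimpleDirectoryReader's `file_metadata` callback. A hedged sketch of inspecting the result (illustrative only; it assumes the loader's `arxiv` dependency is installed and that this llama_index version attaches the per-file metadata to `Document.extra_info`):

```python
from llama_index import download_loader

ArxivReader = download_loader("ArxivReader")
loader = ArxivReader()

papers, abstracts = loader.load_papers_and_abstracts(
    search_query="au:Karpathy", max_results=3
)

# Paper documents should carry the metadata dict built in paper_lookup
# (title, authors, publication date, URL); abstracts hold only summary text.
for doc in papers:
    print(doc.extra_info)
for doc in abstracts:
    print(doc.get_text()[:120], "...")
```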