Merge remote-tracking branch 'upstream/main' into github-reader-test-and-fix

ahmetkca 2023-03-11 14:35:39 -05:00
commit 179acb1b7c
16 changed files with 181 additions and 23 deletions

.gitignore
View File

@@ -1,7 +1,4 @@
 *.egg-info/
-.modules
-/**/*/__pycache__/
 .python-version
-poetry.lock
-pyproject.toml
 .vscode/
+**/__pycache__/

View File

@@ -0,0 +1,20 @@
# Pandas Excel Loader

This loader extracts the text from a column of a local .xlsx file using the `pandas` Python package. A single local file is passed in each time you call `load_data`.

## Usage

To use this loader, you need to pass in a `Path` to a local file, along with the `column_name` from which to extract data. An optional `pandas_config` dict, passed to the reader's constructor, is forwarded to `pandas.read_excel`.

```python
from pathlib import Path
from llama_index import download_loader

PandasExcelReader = download_loader("PandasExcelReader")

loader = PandasExcelReader(pandas_config={"sheet_name": "Sheet1"})
documents = loader.load_data(file=Path('./data.xlsx'), column_name="text_column")
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
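Once loaded, the documents can be handed straight to LlamaIndex. A minimal sketch, assuming the `GPTSimpleVectorIndex` API that llama_index exposed at the time (the query string is hypothetical):

```python
from llama_index import GPTSimpleVectorIndex

# Build a vector index over the loader's output and query it.
index = GPTSimpleVectorIndex(documents)
response = index.query("What does the text column say about revenue?")
print(response)
```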

View File

@@ -0,0 +1 @@
"""Init file."""

View File

@@ -0,0 +1,57 @@
"""Pandas Excel reader.

Pandas parser for .xlsx files.

"""
from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class PandasExcelReader(BaseReader):
    r"""Pandas-based Excel parser.

    Parses .xlsx files using the `pandas.read_excel` function.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        concat_rows (bool): Whether to concatenate all rows into one Document.
            Set to True by default.

        row_joiner (str): Separator to use when concatenating rows.
            Set to "\n" by default.

        pandas_config (dict): Options for the `pandas.read_excel` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information. Set to empty dict by default,
            which means defaults will be used.

    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        pandas_config: Optional[dict] = None,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config or {}

    def load_data(
        self, file: Path, column_name: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file and extract values from a specific column.

        Args:
            file (Path): The path to the Excel file to read.
            column_name (str): The name of the column to use when creating the Document objects.

        Returns:
            List[Document]: A list of Document objects containing the values from the specified column in the Excel file.

        """
        import pandas as pd

        df = pd.read_excel(file, **self._pandas_config)
        text_list = df[column_name].astype(str).tolist()

        if self._concat_rows:
            return [Document(self._row_joiner.join(text_list), extra_info=extra_info)]
        else:
            return [Document(text, extra_info=extra_info) for text in text_list]
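A quick sketch of the two output modes (the `data.xlsx` path and `text_column` name are hypothetical; `concat_rows` is the constructor argument defined above):

```python
from pathlib import Path

# Default behavior: every row joined with "\n" into a single Document.
loader = PandasExcelReader()
docs = loader.load_data(file=Path("./data.xlsx"), column_name="text_column")
assert len(docs) == 1

# With concat_rows=False, each row becomes its own Document.
loader = PandasExcelReader(concat_rows=False)
per_row_docs = loader.load_data(file=Path("./data.xlsx"), column_name="text_column")
```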

View File

@@ -0,0 +1 @@
pandas

View File

@@ -88,7 +88,7 @@ class GoogleDocsReader(BaseReader):
                 flow = InstalledAppFlow.from_client_secrets_file(
                     "credentials.json", SCOPES
                 )
-                creds = flow.run_local_server(port=0)
+                creds = flow.run_local_server(port=8080)
             # Save the credentials for the next run
             with open("token.json", "w") as token:
                 token.write(creds.to_json())
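Note the trade-off in this change: `port=0` lets the library bind any free ephemeral port, while a fixed `port=8080` requires `http://localhost:8080/` to be registered as an authorized redirect URI on the OAuth client, but gives a stable redirect target. A hypothetical guard (not part of this commit) in case 8080 is already taken:

```python
# Prefer the registered port; fall back to an ephemeral one if it is in use.
try:
    creds = flow.run_local_server(port=8080)
except OSError:
    creds = flow.run_local_server(port=0)
```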

View File

@@ -298,6 +298,10 @@
         "author": "alexbowe",
         "keywords": ["readwise", "highlights", "reading", "pkm"]
     },
+    "PandasExcelReader": {
+        "id": "file/pandas_excel",
+        "author": "maccarini"
+    },
     "ZendeskReader": {
         "id": "zendesk",
         "author": "bbornsztein",

View File

@@ -15,4 +15,15 @@ loader = ArxivReader()
 documents = loader.load_data(search_query='au:Karpathy')
 ```
+
+Alternatively, if you would like to load papers and abstracts separately:
+
+```python
+from llama_index import download_loader
+
+ArxivReader = download_loader("ArxivReader")
+
+loader = ArxivReader()
+documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')
+```
 
 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
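Keeping abstracts as separate documents makes it cheap to index only the summaries. A small sketch, again assuming the era's `GPTSimpleVectorIndex` API and a hypothetical query:

```python
from llama_index import GPTSimpleVectorIndex

# Index just the short abstracts; keep the full papers for follow-up reading.
abstract_index = GPTSimpleVectorIndex(abstracts)
print(abstract_index.query("Which paper covers neural network training dynamics?"))
```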

View File

@@ -2,7 +2,7 @@
 import hashlib
 import logging
 import os
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 from llama_index import download_loader
 from llama_index.readers.base import BaseReader
@@ -93,3 +93,74 @@ class ArxivReader(BaseReader):
             print("Unable to delete files or directory")
 
         return arxiv_documents + abstract_documents
+
+    def load_papers_and_abstracts(
+        self,
+        search_query: str,
+        papers_dir: Optional[str] = ".papers",
+        max_results: Optional[int] = 10,
+    ) -> Tuple[List[Document], List[Document]]:
+        """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.
+
+        Args:
+            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
+            papers_dir (Optional[str]): Local directory to store the papers.
+            max_results (Optional[int]): Maximum number of papers to fetch.
+
+        Returns:
+            Tuple[List[Document], List[Document]]: A list of Document objects representing
+                the papers themselves, and a list of Document objects representing their abstracts only.
+        """
+        import arxiv
+
+        arxiv_search = arxiv.Search(
+            query=search_query,
+            id_list=[],
+            max_results=max_results,
+            sort_by=arxiv.SortCriterion.Relevance,
+        )
+        search_results = list(arxiv_search.results())
+        logging.debug(f"> Successfully fetched {len(search_results)} papers")
+
+        if not os.path.exists(papers_dir):
+            os.makedirs(papers_dir)
+
+        paper_lookup = {}
+        for paper in search_results:
+            # Hash filename to avoid bad characters in file path
+            filename = f"{self._hacky_hash(paper.title)}.pdf"
+            paper_lookup[os.path.join(papers_dir, filename)] = {
+                "Title of this paper": paper.title,
+                "Authors": (", ").join([a.name for a in paper.authors]),
+                "Date published": paper.published.strftime("%m/%d/%Y"),
+                "URL": paper.entry_id,
+                # "summary": paper.summary
+            }
+            paper.download_pdf(dirpath=papers_dir, filename=filename)
+            logging.debug(f"> Downloading {filename}...")
+
+        def get_paper_metadata(filename):
+            return paper_lookup[filename]
+
+        SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+        arxiv_documents = SimpleDirectoryReader(
+            papers_dir, file_metadata=get_paper_metadata
+        ).load_data()
+
+        # Include extra documents containing the abstracts
+        abstract_documents = []
+        for paper in search_results:
+            d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}"
+            abstract_documents.append(Document(d))
+
+        # Delete downloaded papers
+        try:
+            for f in os.listdir(papers_dir):
+                os.remove(os.path.join(papers_dir, f))
+                logging.debug(f"> Deleted file: {f}")
+            os.rmdir(papers_dir)
+            logging.debug(f"> Deleted directory: {papers_dir}")
+        except OSError:
+            print("Unable to delete files or directory")
+
+        return arxiv_documents, abstract_documents

View File

@@ -1,9 +1,5 @@
-certifi==2022.12.7
-charset-normalizer==3.0.1
-idna==3.4
-praw==7.6.1
-prawcore==2.3.0
-requests==2.28.2
-update-checker==0.18.0
-urllib3==1.26.14
-websocket-client==1.5.1
+praw~=7.6
+prawcore~=2.3
+requests~=2.28
+update-checker~=0.18
+websocket-client~=1.5
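This and the requirements files below make the same move from exact pins (`==`) to PEP 440 compatible-release specifiers (`~=`), which accept newer releases within the last pinned version component. For example:

```
praw~=7.6                      # equivalent to: praw >= 7.6, == 7.*
youtube_transcript_api~=0.5.0  # equivalent to: >= 0.5.0, == 0.5.*
```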

View File

@@ -1,2 +1,2 @@
-tqdm==4.64.1
-beautifulsoup4==4.11.2
+tqdm~=4.64
+beautifulsoup4~=4.11

View File

@@ -1,3 +1,3 @@
 beautifulsoup4
 requests
-urllib3
\ No newline at end of file
+urllib3

View File

@@ -1 +1 @@
-playwright==1.30.0
+playwright~=1.30

View File

@@ -1 +1 @@
-trafilatura==1.4.1
+trafilatura~=1.4

View File

@@ -1 +1 @@
-wikipedia==1.4.0
+wikipedia~=1.4

View File

@@ -1 +1 @@
-youtube_transcript_api==0.5.0
+youtube_transcript_api~=0.5.0