mirror of https://github.com/run-llama/llama-hub.git (synced 2025-11-13 16:44:36 +00:00)

Merge remote-tracking branch 'upstream/main' into github-reader-test-and-fix

This commit is contained in: commit 179acb1b7c
.gitignore (vendored, 7 lines changed)

```diff
@@ -1,7 +1,4 @@
 *.egg-info/
 .modules
-/**/*/__pycache__/
-.python-version
-poetry.lock
-pyproject.toml
-.vscode/
+**/__pycache__/
```
loader_hub/file/pandas_excel/README.md (new file, 20 lines)

```diff
@@ -0,0 +1,20 @@
+# Pandas Excel Loader
+
+This loader extracts the text from a column of a local .xlsx file using the `pandas` Python package. A single local file is passed in on each call to `load_data`.
+
+## Usage
+
+To use this loader, pass in a `Path` to a local file, along with the `column_name` from which to extract data.
+
+```python
+from pathlib import Path
+from llama_index import download_loader
+
+PandasExcelReader = download_loader("PandasExcelReader")
+
+loader = PandasExcelReader()
+documents = loader.load_data(file=Path('./data.xlsx'), column_name="text_column", pandas_config={"sheet_name": "Sheet1"})
+```
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
```
loader_hub/file/pandas_excel/__init__.py (new file, 1 line)

```diff
@@ -0,0 +1 @@
+"""Init file."""
```
loader_hub/file/pandas_excel/base.py (new file, 57 lines)

```diff
@@ -0,0 +1,57 @@
+"""Pandas Excel reader.
+
+Pandas parser for .xlsx files.
+
+"""
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+
+class PandasExcelReader(BaseReader):
+    r"""Pandas-based Excel parser.
+
+    Parses .xlsx files using the `pandas.read_excel` function.
+    If special parameters are required, use the `pandas_config` dict.
+
+    Args:
+        concat_rows (bool): Whether to concatenate all rows into one Document.
+            True by default.
+        row_joiner (str): Separator used when concatenating rows. "\n" by default.
+        pandas_config (dict): Options for the `pandas.read_excel` function call.
+            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
+            for more information. Set to an empty dict by default, which means
+            the pandas defaults will be used.
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        concat_rows: bool = True,
+        row_joiner: str = "\n",
+        pandas_config: dict = {},
+        **kwargs: Any,
+    ) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+        # Attributes consumed by load_data below.
+        self._concat_rows = concat_rows
+        self._row_joiner = row_joiner
+        self._pandas_config = pandas_config
+
+    def load_data(
+        self, file: Path, column_name: str, extra_info: Optional[Dict] = None
+    ) -> List[Document]:
+        """Parse a file and extract the values from a specific column.
+
+        Args:
+            file (Path): The path to the Excel file to read.
+            column_name (str): The name of the column to use when creating
+                the Document objects.
+
+        Returns:
+            List[Document]: A list of Document objects containing the values
+                from the specified column in the Excel file.
+        """
+        import pandas as pd
+
+        df = pd.read_excel(file, **self._pandas_config)
+
+        text_list = df[column_name].astype(str).tolist()
+
+        if self._concat_rows:
+            return [Document(self._row_joiner.join(text_list), extra_info=extra_info)]
+        else:
+            return [Document(text, extra_info=extra_info) for text in text_list]
```
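A minimal usage sketch of the reader above, run directly rather than through `download_loader` (assumptions: `pandas` and `openpyxl` are installed, and `./data.xlsx` is a hypothetical workbook with a `text_column` column):

```python
from pathlib import Path

# Join every row of "text_column" on Sheet1 into a single Document.
reader = PandasExcelReader(pandas_config={"sheet_name": "Sheet1"})
documents = reader.load_data(file=Path("./data.xlsx"), column_name="text_column")
print(len(documents))  # 1, since concat_rows defaults to True
```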
loader_hub/file/pandas_excel/requirements.txt (new file, 1 line)

```diff
@@ -0,0 +1 @@
+pandas
```
GoogleDocsReader OAuth flow (1 line changed)

```diff
@@ -88,7 +88,7 @@ class GoogleDocsReader(BaseReader):
             flow = InstalledAppFlow.from_client_secrets_file(
                 "credentials.json", SCOPES
             )
-            creds = flow.run_local_server(port=0)
+            creds = flow.run_local_server(port=8080)
             # Save the credentials for the next run
             with open("token.json", "w") as token:
                 token.write(creds.to_json())
```
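A short note on the one-line change above: `run_local_server(port=0)` asks the OS for any free port, while a fixed port like 8080 gives the loopback redirect a stable address. A minimal sketch of the surrounding flow, assuming google-auth-oauthlib is installed (the `SCOPES` value here is an assumption, not taken from the hunk):

```python
from google_auth_oauthlib.flow import InstalledAppFlow

# Assumed read-only Google Docs scope; the reader defines its own SCOPES elsewhere.
SCOPES = ["https://www.googleapis.com/auth/documents.readonly"]

flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
# With a fixed port, the consent redirect always lands on http://localhost:8080/.
creds = flow.run_local_server(port=8080)
```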
Loader registry (4 lines added)

```diff
@@ -298,6 +298,10 @@
         "author": "alexbowe",
         "keywords": ["readwise", "highlights", "reading", "pkm"]
     },
+    "PandasExcelReader": {
+        "id": "file/pandas_excel",
+        "author": "maccarini"
+    },
     "ZendeskReader": {
         "id": "zendesk",
         "author": "bbornsztein",
```
ArxivReader README (11 lines added)

````diff
@@ -15,4 +15,15 @@ loader = ArxivReader()
 documents = loader.load_data(search_query='au:Karpathy')
 ```
+
+Alternatively, if you would like to load papers and abstracts separately:
+
+```python
+from llama_index import download_loader
+
+ArxivReader = download_loader("ArxivReader")
+
+loader = ArxivReader()
+documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')
+```
 
 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
````
ArxivReader base.py, imports (1 line changed)

```diff
@@ -2,7 +2,7 @@
 import hashlib
 import logging
 import os
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 from llama_index import download_loader
 from llama_index.readers.base import BaseReader
```
ArxivReader base.py, new load_papers_and_abstracts method

```diff
@@ -93,3 +93,74 @@ class ArxivReader(BaseReader):
             print("Unable to delete files or directory")
 
         return arxiv_documents + abstract_documents
+
+    def load_papers_and_abstracts(
+        self,
+        search_query: str,
+        papers_dir: Optional[str] = ".papers",
+        max_results: Optional[int] = 10,
+    ) -> Tuple[List[Document], List[Document]]:
+        """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.
+
+        Args:
+            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
+            papers_dir (Optional[str]): Local directory in which to store the papers.
+            max_results (Optional[int]): Maximum number of papers to fetch.
+
+        Returns:
+            List[Document]: A list of Document objects representing the papers themselves.
+            List[Document]: A list of Document objects representing the abstracts only.
+        """
+        import arxiv
+
+        arxiv_search = arxiv.Search(
+            query=search_query,
+            id_list=[],
+            max_results=max_results,
+            sort_by=arxiv.SortCriterion.Relevance,
+        )
+        search_results = list(arxiv_search.results())
+        logging.debug(f"> Successfully fetched {len(search_results)} papers")
+
+        if not os.path.exists(papers_dir):
+            os.makedirs(papers_dir)
+
+        paper_lookup = {}
+        for paper in search_results:
+            # Hash the title to avoid bad characters in the file path.
+            filename = f"{self._hacky_hash(paper.title)}.pdf"
+            paper_lookup[os.path.join(papers_dir, filename)] = {
+                "Title of this paper": paper.title,
+                "Authors": (", ").join([a.name for a in paper.authors]),
+                "Date published": paper.published.strftime("%m/%d/%Y"),
+                "URL": paper.entry_id,
+                # "summary": paper.summary
+            }
+            paper.download_pdf(dirpath=papers_dir, filename=filename)
+            logging.debug(f"> Downloading {filename}...")
+
+        def get_paper_metadata(filename):
+            return paper_lookup[filename]
+
+        SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+        arxiv_documents = SimpleDirectoryReader(
+            papers_dir, file_metadata=get_paper_metadata
+        ).load_data()
+        # Include extra documents containing the abstracts
+        abstract_documents = []
+        for paper in search_results:
+            d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}"
+            abstract_documents.append(Document(d))
+
+        # Delete the downloaded papers
+        try:
+            for f in os.listdir(papers_dir):
+                os.remove(os.path.join(papers_dir, f))
+                logging.debug(f"> Deleted file: {f}")
+            os.rmdir(papers_dir)
+            logging.debug(f"> Deleted directory: {papers_dir}")
+        except OSError:
+            print("Unable to delete files or directory")
+
+        return arxiv_documents, abstract_documents
```
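For context on the `file_metadata=get_paper_metadata` hook above, here is a simplified stand-in (not the actual `SimpleDirectoryReader` implementation) showing how a per-file metadata callback is consumed:

```python
from pathlib import Path
from typing import Callable, Dict, List

def load_dir_with_metadata(
    papers_dir: str, file_metadata: Callable[[str], Dict]
) -> List[Dict]:
    """Pair each downloaded PDF with the metadata recorded in paper_lookup."""
    records = []
    for path in sorted(Path(papers_dir).iterdir()):
        # The callback maps a file path back to its title/authors/date entry.
        records.append({"file": str(path), "extra_info": file_metadata(str(path))})
    return records
```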
Several loader requirements.txt files relax exact pins to compatible-release ranges:

```diff
@@ -1,9 +1,5 @@
-certifi==2022.12.7
-charset-normalizer==3.0.1
-idna==3.4
-praw==7.6.1
-prawcore==2.3.0
-requests==2.28.2
-update-checker==0.18.0
-urllib3==1.26.14
-websocket-client==1.5.1
+praw~=7.6
+prawcore~=2.3
+requests~=2.28
+update-checker~=0.18
+websocket-client~=1.5
@@ -1,2 +1,2 @@
-tqdm==4.64.1
-beautifulsoup4==4.11.2
+tqdm~=4.64
+beautifulsoup4~=4.11
@@ -1 +1 @@
-playwright==1.30.0
+playwright~=1.30
@@ -1 +1 @@
-trafilatura==1.4.1
+trafilatura~=1.4
@@ -1 +1 @@
-wikipedia==1.4.0
+wikipedia~=1.4
@@ -1 +1 @@
-youtube_transcript_api==0.5.0
+youtube_transcript_api~=0.5.0
```
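The hunks above swap exact pins (`==`) for PEP 440 compatible-release ranges (`~=`), and drop pins on transitive dependencies such as certifi and urllib3. A quick sketch of what the `~=` operator accepts, using the `packaging` library (an illustration, not part of the commit):

```python
from packaging.specifiers import SpecifierSet

# "~=7.6" means ">=7.6, <8.0": minor and patch bumps pass, major bumps do not.
spec = SpecifierSet("~=7.6")
print("7.6.1" in spec)  # True
print("7.9.0" in spec)  # True
print("8.0.0" in spec)  # False
```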