mirror of https://github.com/run-llama/llama-hub.git (synced 2025-11-13 16:44:36 +00:00)

Merge remote-tracking branch 'upstream/main' into github-reader-test-and-fix

This commit is contained in: commit 179acb1b7c
.gitignore (vendored, 7 lines changed)

```diff
@@ -1,7 +1,4 @@
 *.egg-info/
 .modules
-/**/*/__pycache__/
-.python-version
-poetry.lock
-pyproject.toml
-.vscode/
+**/__pycache__/
```
loader_hub/file/pandas_excel/README.md (new file, 20 lines)

```diff
@@ -0,0 +1,20 @@
+# Pandas Excel Loader
+
+This loader extracts the text from a column of a local .xlsx file using the `pandas` Python package. A single local file is passed in on each call to `load_data`.
+
+## Usage
+
+To use this loader, pass in a `Path` to a local file, along with the `column_name` from which to extract data.
+
+```python
+from pathlib import Path
+from llama_index import download_loader
+
+PandasExcelReader = download_loader("PandasExcelReader")
+
+loader = PandasExcelReader()
+documents = loader.load_data(file=Path('./data.xlsx'), column_name="text_column", pandas_config={"sheet_name": "Sheet1"})
+```
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
```
loader_hub/file/pandas_excel/__init__.py (new file, 1 line)

```diff
@@ -0,0 +1 @@
+"""Init file."""
```
loader_hub/file/pandas_excel/base.py (new file, 57 lines)

```diff
@@ -0,0 +1,57 @@
+"""Pandas Excel reader.
+
+Pandas parser for .xlsx files.
+
+"""
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+
+class PandasExcelReader(BaseReader):
+    r"""Pandas-based Excel parser.
+
+    Parses .xlsx files using the `pandas.read_excel` function.
+    If special parameters are required, use the `pandas_config` dict.
+
+    Args:
+        concat_rows (bool): Whether to concatenate all rows into one Document.
+            True by default.
+        row_joiner (str): Separator used when concatenating rows. "\n" by default.
+        pandas_config (dict): Options for the `pandas.read_excel` function call.
+            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
+            for more information. Set to an empty dict by default, which means
+            the pandas defaults will be used.
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        concat_rows: bool = True,
+        row_joiner: str = "\n",
+        pandas_config: dict = {},
+        **kwargs: Any,
+    ) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+        # Attributes consumed by load_data below.
+        self._concat_rows = concat_rows
+        self._row_joiner = row_joiner
+        self._pandas_config = pandas_config
+
+    def load_data(
+        self, file: Path, column_name: str, extra_info: Optional[Dict] = None
+    ) -> List[Document]:
+        """Parse a file and extract the values from a specific column.
+
+        Args:
+            file (Path): The path to the Excel file to read.
+            column_name (str): The name of the column to use when creating
+                the Document objects.
+
+        Returns:
+            List[Document]: A list of Document objects containing the values
+                from the specified column in the Excel file.
+        """
+        import pandas as pd
+
+        df = pd.read_excel(file, **self._pandas_config)
+
+        text_list = df[column_name].astype(str).tolist()
+
+        if self._concat_rows:
+            return [Document(self._row_joiner.join(text_list), extra_info=extra_info)]
+        else:
+            return [Document(text, extra_info=extra_info) for text in text_list]
```
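A minimal usage sketch of the reader above, run directly rather than through `download_loader` (assumptions: `pandas` and `openpyxl` are installed, and `./data.xlsx` is a hypothetical workbook with a `text_column` column):

```python
from pathlib import Path

# Join every row of "text_column" on Sheet1 into a single Document.
reader = PandasExcelReader(pandas_config={"sheet_name": "Sheet1"})
documents = reader.load_data(file=Path("./data.xlsx"), column_name="text_column")
print(len(documents))  # 1, since concat_rows defaults to True
```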
loader_hub/file/pandas_excel/requirements.txt (new file, 1 line)

```diff
@@ -0,0 +1 @@
+pandas
```
GoogleDocsReader OAuth flow (1 line changed)

```diff
@@ -88,7 +88,7 @@ class GoogleDocsReader(BaseReader):
             flow = InstalledAppFlow.from_client_secrets_file(
                 "credentials.json", SCOPES
             )
-            creds = flow.run_local_server(port=0)
+            creds = flow.run_local_server(port=8080)
             # Save the credentials for the next run
             with open("token.json", "w") as token:
                 token.write(creds.to_json())
```
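A short note on the one-line change above: `run_local_server(port=0)` asks the OS for any free port, while a fixed port like 8080 gives the loopback redirect a stable address. A minimal sketch of the surrounding flow, assuming google-auth-oauthlib is installed (the `SCOPES` value here is an assumption, not taken from the hunk):

```python
from google_auth_oauthlib.flow import InstalledAppFlow

# Assumed read-only Google Docs scope; the reader defines its own SCOPES elsewhere.
SCOPES = ["https://www.googleapis.com/auth/documents.readonly"]

flow = InstalledAppFlow.from_client_secrets_file("credentials.json", SCOPES)
# With a fixed port, the consent redirect always lands on http://localhost:8080/.
creds = flow.run_local_server(port=8080)
```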
Loader registry (4 lines added)

```diff
@@ -298,6 +298,10 @@
         "author": "alexbowe",
         "keywords": ["readwise", "highlights", "reading", "pkm"]
     },
+    "PandasExcelReader": {
+        "id": "file/pandas_excel",
+        "author": "maccarini"
+    },
     "ZendeskReader": {
         "id": "zendesk",
         "author": "bbornsztein",
```
ArxivReader README (11 lines added)

````diff
@@ -15,4 +15,15 @@ loader = ArxivReader()
 documents = loader.load_data(search_query='au:Karpathy')
 ```
+
+Alternatively, if you would like to load papers and abstracts separately:
+
+```python
+from llama_index import download_loader
+
+ArxivReader = download_loader("ArxivReader")
+
+loader = ArxivReader()
+documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')
+```
 
 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
````
ArxivReader base.py, imports (1 line changed)

```diff
@@ -2,7 +2,7 @@
 import hashlib
 import logging
 import os
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 from llama_index import download_loader
 from llama_index.readers.base import BaseReader
```
ArxivReader base.py, new load_papers_and_abstracts method

```diff
@@ -93,3 +93,74 @@ class ArxivReader(BaseReader):
             print("Unable to delete files or directory")
 
         return arxiv_documents + abstract_documents
+
+    def load_papers_and_abstracts(
+        self,
+        search_query: str,
+        papers_dir: Optional[str] = ".papers",
+        max_results: Optional[int] = 10,
+    ) -> Tuple[List[Document], List[Document]]:
+        """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.
+
+        Args:
+            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
+            papers_dir (Optional[str]): Local directory in which to store the papers.
+            max_results (Optional[int]): Maximum number of papers to fetch.
+
+        Returns:
+            List[Document]: A list of Document objects representing the papers themselves.
+            List[Document]: A list of Document objects representing the abstracts only.
+        """
+        import arxiv
+
+        arxiv_search = arxiv.Search(
+            query=search_query,
+            id_list=[],
+            max_results=max_results,
+            sort_by=arxiv.SortCriterion.Relevance,
+        )
+        search_results = list(arxiv_search.results())
+        logging.debug(f"> Successfully fetched {len(search_results)} papers")
+
+        if not os.path.exists(papers_dir):
+            os.makedirs(papers_dir)
+
+        paper_lookup = {}
+        for paper in search_results:
+            # Hash the title to avoid bad characters in the file path.
+            filename = f"{self._hacky_hash(paper.title)}.pdf"
+            paper_lookup[os.path.join(papers_dir, filename)] = {
+                "Title of this paper": paper.title,
+                "Authors": (", ").join([a.name for a in paper.authors]),
+                "Date published": paper.published.strftime("%m/%d/%Y"),
+                "URL": paper.entry_id,
+                # "summary": paper.summary
+            }
+            paper.download_pdf(dirpath=papers_dir, filename=filename)
+            logging.debug(f"> Downloading {filename}...")
+
+        def get_paper_metadata(filename):
+            return paper_lookup[filename]
+
+        SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+        arxiv_documents = SimpleDirectoryReader(
+            papers_dir, file_metadata=get_paper_metadata
+        ).load_data()
+        # Include extra documents containing the abstracts
+        abstract_documents = []
+        for paper in search_results:
+            d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}"
+            abstract_documents.append(Document(d))
+
+        # Delete the downloaded papers
+        try:
+            for f in os.listdir(papers_dir):
+                os.remove(os.path.join(papers_dir, f))
+                logging.debug(f"> Deleted file: {f}")
+            os.rmdir(papers_dir)
+            logging.debug(f"> Deleted directory: {papers_dir}")
+        except OSError:
+            print("Unable to delete files or directory")
+
+        return arxiv_documents, abstract_documents
```
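For context on the `file_metadata=get_paper_metadata` hook above, here is a simplified stand-in (not the actual `SimpleDirectoryReader` implementation) showing how a per-file metadata callback is consumed:

```python
from pathlib import Path
from typing import Callable, Dict, List

def load_dir_with_metadata(
    papers_dir: str, file_metadata: Callable[[str], Dict]
) -> List[Dict]:
    """Pair each downloaded PDF with the metadata recorded in paper_lookup."""
    records = []
    for path in sorted(Path(papers_dir).iterdir()):
        # The callback maps a file path back to its title/authors/date entry.
        records.append({"file": str(path), "extra_info": file_metadata(str(path))})
    return records
```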
Several loader requirements.txt files relax exact pins to compatible-release ranges:

```diff
@@ -1,9 +1,5 @@
-certifi==2022.12.7
-charset-normalizer==3.0.1
-idna==3.4
-praw==7.6.1
-prawcore==2.3.0
-requests==2.28.2
-update-checker==0.18.0
-urllib3==1.26.14
-websocket-client==1.5.1
+praw~=7.6
+prawcore~=2.3
+requests~=2.28
+update-checker~=0.18
+websocket-client~=1.5
@@ -1,2 +1,2 @@
-tqdm==4.64.1
-beautifulsoup4==4.11.2
+tqdm~=4.64
+beautifulsoup4~=4.11
@@ -1 +1 @@
-playwright==1.30.0
+playwright~=1.30
@@ -1 +1 @@
-trafilatura==1.4.1
+trafilatura~=1.4
@@ -1 +1 @@
-wikipedia==1.4.0
+wikipedia~=1.4
@@ -1 +1 @@
-youtube_transcript_api==0.5.0
+youtube_transcript_api~=0.5.0
```
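The hunks above swap exact pins (`==`) for PEP 440 compatible-release ranges (`~=`), and drop pins on transitive dependencies such as certifi and urllib3. A quick sketch of what the `~=` operator accepts, using the `packaging` library (an illustration, not part of the commit):

```python
from packaging.specifiers import SpecifierSet

# "~=7.6" means ">=7.6, <8.0": minor and patch bumps pass, major bumps do not.
spec = SpecifierSet("~=7.6")
print("7.6.1" in spec)  # True
print("7.9.0" in spec)  # True
print("8.0.0" in spec)  # False
```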