Mirror of https://github.com/run-llama/llama-hub.git (synced 2025-11-03 03:10:09 +00:00)
Merge remote-tracking branch 'upstream/main' into github-reader-test-and-fix
commit 179acb1b7c

.gitignore (vendored) | 7
@@ -1,7 +1,4 @@
*.egg-info/
.modules
/**/*/__pycache__/
.python-version
poetry.lock
pyproject.toml
.vscode/
**/__pycache__/
loader_hub/file/pandas_excel/README.md (new file) | 20
@@ -0,0 +1,20 @@
# Pandas Excel Loader

This loader extracts text from a column of a local `.xlsx` file using the `pandas` Python package. A single local file is passed in each time you call `load_data`.

## Usage

To use this loader, pass in a `Path` to a local file, along with the `column_name` from which to extract data.

```python
from pathlib import Path
from llama_index import download_loader

PandasExcelReader = download_loader("PandasExcelReader")

loader = PandasExcelReader()
documents = loader.load_data(
    file=Path("./data.xlsx"),
    column_name="text_column",
    pandas_config={"sheet_name": "Sheet1"},
)
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
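As a rough sketch of the "load data into LlamaIndex" path mentioned above (not part of this loader's README), the documents returned by `load_data` can be dropped straight into an index. `GPTSimpleVectorIndex` is assumed from the llama_index API of this period and may be named differently in later versions.

```python
from pathlib import Path

from llama_index import GPTSimpleVectorIndex, download_loader

PandasExcelReader = download_loader("PandasExcelReader")
loader = PandasExcelReader()
documents = loader.load_data(file=Path("./data.xlsx"), column_name="text_column")

# Build an in-memory vector index over the extracted column and query it.
index = GPTSimpleVectorIndex(documents)
print(index.query("Summarize the contents of the text column"))
```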
loader_hub/file/pandas_excel/__init__.py (new file) | 1
@@ -0,0 +1 @@
"""Init file."""
loader_hub/file/pandas_excel/base.py (new file) | 57
@@ -0,0 +1,57 @@
"""Pandas Excel reader.

Pandas parser for .xlsx files.

"""
from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class PandasExcelReader(BaseReader):
    r"""Pandas-based Excel parser.

    Extracts the values of a single column from an .xlsx file using the
    `pandas.read_excel` function. If special parameters are required, use
    the `pandas_config` dict.

    Args:
        concat_rows (bool): Whether to concatenate all rows into a single Document.
            If set to False, one Document is created per row. True by default.
        row_joiner (str): Separator used when concatenating rows. "\n" by default.
        pandas_config (dict): Options for the `pandas.read_excel` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information. Set to empty dict by default, which means pandas defaults will be used.

    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        pandas_config: dict = {},
        **kwargs: Any
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        # load_data below relies on these attributes.
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config

    def load_data(
        self, file: Path, column_name: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file and extract values from a specific column.

        Args:
            file (Path): The path to the Excel file to read.
            column_name (str): The name of the column to use when creating the Document objects.

        Returns:
            List[Document]: A list of Document objects containing the values from the specified column in the Excel file.
        """
        import pandas as pd

        df = pd.read_excel(file, **self._pandas_config)

        text_list = df[column_name].astype(str).tolist()

        if self._concat_rows:
            return [Document((self._row_joiner).join(text_list), extra_info=extra_info)]
        else:
            return [Document(text, extra_info=extra_info) for text in text_list]
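Since `pandas_config` is forwarded verbatim to `pandas.read_excel`, any `read_excel` keyword works there. A minimal sketch (not part of the committed file; the keyword values are illustrative):

```python
from pathlib import Path

# sheet_name, header and usecols are standard pandas.read_excel keywords.
reader = PandasExcelReader(
    pandas_config={"sheet_name": "Sheet1", "header": 0, "usecols": "A:C"},
)
documents = reader.load_data(file=Path("./data.xlsx"), column_name="text_column")
```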
loader_hub/file/pandas_excel/requirements.txt (new file) | 1
@@ -0,0 +1 @@
pandas
@@ -88,7 +88,7 @@ class GoogleDocsReader(BaseReader):
                flow = InstalledAppFlow.from_client_secrets_file(
                    "credentials.json", SCOPES
                )
                creds = flow.run_local_server(port=0)
                creds = flow.run_local_server(port=8080)
            # Save the credentials for the next run
            with open("token.json", "w") as token:
                token.write(creds.to_json())
@@ -298,6 +298,10 @@
        "author": "alexbowe",
        "keywords": ["readwise", "highlights", "reading", "pkm"]
    },
    "PandasExcelReader": {
        "id": "file/pandas_excel",
        "author": "maccarini"
    },
    "ZendeskReader": {
        "id": "zendesk",
        "author": "bbornsztein",
@@ -15,4 +15,15 @@ loader = ArxivReader()
documents = loader.load_data(search_query='au:Karpathy')
```

Alternatively, if you would like to load papers and abstracts separately:

```python
from llama_index import download_loader

ArxivReader = download_loader("ArxivReader")

loader = ArxivReader()
documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
@@ -2,7 +2,7 @@
import hashlib
import logging
import os
from typing import List, Optional
from typing import List, Optional, Tuple

from llama_index import download_loader
from llama_index.readers.base import BaseReader
@@ -93,3 +93,74 @@ class ArxivReader(BaseReader):
            print("Unable to delete files or directory")

        return arxiv_documents + abstract_documents

    def load_papers_and_abstracts(
        self,
        search_query: str,
        papers_dir: Optional[str] = ".papers",
        max_results: Optional[int] = 10,
    ) -> Tuple[List[Document], List[Document]]:
        """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.

        Args:
            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
            papers_dir (Optional[str]): Local directory to store the papers.
            max_results (Optional[int]): Maximum number of papers to fetch.

        Returns:
            List[Document]: A list of Document objects representing the papers themselves.
            List[Document]: A list of Document objects representing abstracts only.
        """
        import arxiv

        arxiv_search = arxiv.Search(
            query=search_query,
            id_list=[],
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        search_results = list(arxiv_search.results())
        logging.debug(f"> Successfully fetched {len(search_results)} papers")

        if not os.path.exists(papers_dir):
            os.makedirs(papers_dir)

        paper_lookup = {}
        for paper in search_results:
            # Hash filename to avoid bad characters in file path
            filename = f"{self._hacky_hash(paper.title)}.pdf"
            paper_lookup[os.path.join(papers_dir, filename)] = {
                "Title of this paper": paper.title,
                "Authors": (", ").join([a.name for a in paper.authors]),
                "Date published": paper.published.strftime("%m/%d/%Y"),
                "URL": paper.entry_id,
                # "summary": paper.summary
            }
            paper.download_pdf(dirpath=papers_dir, filename=filename)
            logging.debug(f"> Downloading {filename}...")

        def get_paper_metadata(filename):
            return paper_lookup[filename]

        SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
        arxiv_documents = SimpleDirectoryReader(
            papers_dir, file_metadata=get_paper_metadata
        ).load_data()
        # Include extra documents containing the abstracts
        abstract_documents = []
        for paper in search_results:
            d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}"
            abstract_documents.append(Document(d))

        # Delete downloaded papers
        try:
            for f in os.listdir(papers_dir):
                os.remove(os.path.join(papers_dir, f))
                logging.debug(f"> Deleted file: {f}")
            os.rmdir(papers_dir)
            logging.debug(f"> Deleted directory: {papers_dir}")
        except OSError:
            print("Unable to delete files or directory")

        return arxiv_documents, abstract_documents
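As a hypothetical usage sketch (not part of this commit), the two lists returned by `load_papers_and_abstracts` could feed separate indexes, e.g. a lightweight abstract-only index next to a full-paper index. `GPTSimpleVectorIndex` is assumed from the llama_index API of this period.

```python
from llama_index import GPTSimpleVectorIndex, download_loader

ArxivReader = download_loader("ArxivReader")
loader = ArxivReader()
papers, abstracts = loader.load_papers_and_abstracts(search_query="au:Karpathy")

# Query the cheap abstract index first; fall back to the full-paper index if needed.
abstract_index = GPTSimpleVectorIndex(abstracts)
paper_index = GPTSimpleVectorIndex(papers)
print(abstract_index.query("Which papers cover neural network training?"))
```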
@@ -1,9 +1,5 @@
certifi==2022.12.7
charset-normalizer==3.0.1
idna==3.4
praw==7.6.1
prawcore==2.3.0
requests==2.28.2
update-checker==0.18.0
urllib3==1.26.14
websocket-client==1.5.1
praw~=7.6
prawcore~=2.3
requests~=2.28
update-checker~=0.18
websocket-client~=1.5
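The pins above (and in the hunks that follow) move from exact `==` versions to PEP 440 compatible-release specifiers: `praw~=7.6` accepts any 7.x release at or above 7.6 but excludes 8.0. A quick way to check what a specifier admits, using the `packaging` library (a sketch, not part of this change):

```python
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=7.6")
print(spec.contains("7.6.1"))  # True: compatible minor/patch release
print(spec.contains("8.0.0"))  # False: major version bump is excluded
```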
@@ -1,2 +1,2 @@
tqdm==4.64.1
beautifulsoup4==4.11.2
tqdm~=4.64
beautifulsoup4~=4.11
@@ -1,3 +1,3 @@
beautifulsoup4
requests
urllib3
urllib3
@@ -1 +1 @@
playwright==1.30.0
playwright~=1.30
@@ -1 +1 @@
trafilatura==1.4.1
trafilatura~=1.4
@@ -1 +1 @@
wikipedia==1.4.0
wikipedia~=1.4
@@ -1 +1 @@
youtube_transcript_api==0.5.0
youtube_transcript_api~=0.5.0