Merge remote-tracking branch 'upstream/main' into github-reader-test-and-fix

ahmetkca 2023-03-11 14:35:39 -05:00
commit 179acb1b7c
16 changed files with 181 additions and 23 deletions

.gitignore
View File

@@ -1,7 +1,4 @@
 *.egg-info/
-.modules
-/**/*/__pycache__/
 .python-version
-poetry.lock
-pyproject.toml
 .vscode/
+**/__pycache__/

View File

@@ -0,0 +1,20 @@
# Pandas Excel Loader

This loader extracts the text from a column of a local .xlsx file using the `pandas` Python package. A single local file is passed in each time you call `load_data`.

## Usage

To use this loader, you need to pass in a `Path` to a local file, along with the `column_name` from which to extract data. An optional `pandas_config` dict, passed to the reader's constructor, is forwarded to `pandas.read_excel`.

```python
from pathlib import Path
from llama_index import download_loader

PandasExcelReader = download_loader("PandasExcelReader")

loader = PandasExcelReader(pandas_config={"sheet_name": "Sheet1"})
documents = loader.load_data(file=Path('./data.xlsx'), column_name="text_column")
```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
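Once loaded, the documents can be handed straight to LlamaIndex. A minimal sketch, assuming the `GPTSimpleVectorIndex` API that llama_index exposed at the time (the query string is hypothetical):

```python
from llama_index import GPTSimpleVectorIndex

# Build a vector index over the loader's output and query it.
index = GPTSimpleVectorIndex(documents)
response = index.query("What does the text column say about revenue?")
print(response)
```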

View File

@@ -0,0 +1 @@
"""Init file."""

View File

@@ -0,0 +1,57 @@
"""Pandas Excel reader.

Pandas parser for .xlsx files.

"""
from pathlib import Path
from typing import Any, Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document


class PandasExcelReader(BaseReader):
    r"""Pandas-based Excel parser.

    Parses .xlsx files using the `pandas.read_excel` function.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        concat_rows (bool): Whether to concatenate all rows into one Document.
            Set to True by default.

        row_joiner (str): Separator to use when concatenating rows.
            Set to "\n" by default.

        pandas_config (dict): Options for the `pandas.read_excel` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
            for more information. Set to empty dict by default,
            which means defaults will be used.

    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        row_joiner: str = "\n",
        pandas_config: Optional[dict] = None,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._row_joiner = row_joiner
        self._pandas_config = pandas_config or {}

    def load_data(
        self, file: Path, column_name: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse file and extract values from a specific column.

        Args:
            file (Path): The path to the Excel file to read.
            column_name (str): The name of the column to use when creating the Document objects.

        Returns:
            List[Document]: A list of Document objects containing the values from the specified column in the Excel file.

        """
        import pandas as pd

        df = pd.read_excel(file, **self._pandas_config)
        text_list = df[column_name].astype(str).tolist()

        if self._concat_rows:
            return [Document(self._row_joiner.join(text_list), extra_info=extra_info)]
        else:
            return [Document(text, extra_info=extra_info) for text in text_list]
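A quick sketch of the two output modes (the `data.xlsx` path and `text_column` name are hypothetical; `concat_rows` is the constructor argument defined above):

```python
from pathlib import Path

# Default behavior: every row joined with "\n" into a single Document.
loader = PandasExcelReader()
docs = loader.load_data(file=Path("./data.xlsx"), column_name="text_column")
assert len(docs) == 1

# With concat_rows=False, each row becomes its own Document.
loader = PandasExcelReader(concat_rows=False)
per_row_docs = loader.load_data(file=Path("./data.xlsx"), column_name="text_column")
```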

View File

@@ -0,0 +1 @@
pandas

View File

@@ -88,7 +88,7 @@ class GoogleDocsReader(BaseReader):
                 flow = InstalledAppFlow.from_client_secrets_file(
                     "credentials.json", SCOPES
                 )
-                creds = flow.run_local_server(port=0)
+                creds = flow.run_local_server(port=8080)
             # Save the credentials for the next run
             with open("token.json", "w") as token:
                 token.write(creds.to_json())
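Note the trade-off in this change: `port=0` lets the library bind any free ephemeral port, while a fixed `port=8080` requires `http://localhost:8080/` to be registered as an authorized redirect URI on the OAuth client, but gives a stable redirect target. A hypothetical guard (not part of this commit) in case 8080 is already taken:

```python
# Prefer the registered port; fall back to an ephemeral one if it is in use.
try:
    creds = flow.run_local_server(port=8080)
except OSError:
    creds = flow.run_local_server(port=0)
```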

View File

@@ -298,6 +298,10 @@
         "author": "alexbowe",
         "keywords": ["readwise", "highlights", "reading", "pkm"]
     },
+    "PandasExcelReader": {
+        "id": "file/pandas_excel",
+        "author": "maccarini"
+    },
     "ZendeskReader": {
         "id": "zendesk",
         "author": "bbornsztein",

View File

@@ -15,4 +15,15 @@ loader = ArxivReader()
 documents = loader.load_data(search_query='au:Karpathy')
 ```
+
+Alternatively, if you would like to load papers and abstracts separately:
+
+```python
+from llama_index import download_loader
+
+ArxivReader = download_loader("ArxivReader")
+
+loader = ArxivReader()
+documents, abstracts = loader.load_papers_and_abstracts(search_query='au:Karpathy')
+```
 
 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
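Keeping abstracts as separate documents makes it cheap to index only the summaries. A small sketch, again assuming the era's `GPTSimpleVectorIndex` API and a hypothetical query:

```python
from llama_index import GPTSimpleVectorIndex

# Index just the short abstracts; keep the full papers for follow-up reading.
abstract_index = GPTSimpleVectorIndex(abstracts)
print(abstract_index.query("Which paper covers neural network training dynamics?"))
```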

View File

@@ -2,7 +2,7 @@
 import hashlib
 import logging
 import os
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 from llama_index import download_loader
 from llama_index.readers.base import BaseReader
@@ -93,3 +93,74 @@ class ArxivReader(BaseReader):
             print("Unable to delete files or directory")
 
         return arxiv_documents + abstract_documents
+
+    def load_papers_and_abstracts(
+        self,
+        search_query: str,
+        papers_dir: Optional[str] = ".papers",
+        max_results: Optional[int] = 10,
+    ) -> Tuple[List[Document], List[Document]]:
+        """Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.
+
+        Args:
+            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
+            papers_dir (Optional[str]): Local directory to store the papers.
+            max_results (Optional[int]): Maximum number of papers to fetch.
+
+        Returns:
+            Tuple[List[Document], List[Document]]: A list of Document objects representing
+                the papers themselves, and a list of Document objects representing their abstracts only.
+        """
+        import arxiv
+
+        arxiv_search = arxiv.Search(
+            query=search_query,
+            id_list=[],
+            max_results=max_results,
+            sort_by=arxiv.SortCriterion.Relevance,
+        )
+        search_results = list(arxiv_search.results())
+        logging.debug(f"> Successfully fetched {len(search_results)} papers")
+
+        if not os.path.exists(papers_dir):
+            os.makedirs(papers_dir)
+
+        paper_lookup = {}
+        for paper in search_results:
+            # Hash filename to avoid bad characters in file path
+            filename = f"{self._hacky_hash(paper.title)}.pdf"
+            paper_lookup[os.path.join(papers_dir, filename)] = {
+                "Title of this paper": paper.title,
+                "Authors": (", ").join([a.name for a in paper.authors]),
+                "Date published": paper.published.strftime("%m/%d/%Y"),
+                "URL": paper.entry_id,
+                # "summary": paper.summary
+            }
+            paper.download_pdf(dirpath=papers_dir, filename=filename)
+            logging.debug(f"> Downloading {filename}...")
+
+        def get_paper_metadata(filename):
+            return paper_lookup[filename]
+
+        SimpleDirectoryReader = download_loader("SimpleDirectoryReader")
+        arxiv_documents = SimpleDirectoryReader(
+            papers_dir, file_metadata=get_paper_metadata
+        ).load_data()
+
+        # Include extra documents containing the abstracts
+        abstract_documents = []
+        for paper in search_results:
+            d = f"The following is a summary of the paper: {paper.title}\n\nSummary: {paper.summary}"
+            abstract_documents.append(Document(d))
+
+        # Delete downloaded papers
+        try:
+            for f in os.listdir(papers_dir):
+                os.remove(os.path.join(papers_dir, f))
+                logging.debug(f"> Deleted file: {f}")
+            os.rmdir(papers_dir)
+            logging.debug(f"> Deleted directory: {papers_dir}")
+        except OSError:
+            print("Unable to delete files or directory")
+
+        return arxiv_documents, abstract_documents

View File

@@ -1,9 +1,5 @@
-certifi==2022.12.7
-charset-normalizer==3.0.1
-idna==3.4
-praw==7.6.1
-prawcore==2.3.0
-requests==2.28.2
-update-checker==0.18.0
-urllib3==1.26.14
-websocket-client==1.5.1
+praw~=7.6
+prawcore~=2.3
+requests~=2.28
+update-checker~=0.18
+websocket-client~=1.5
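This and the requirements files below make the same move from exact pins (`==`) to PEP 440 compatible-release specifiers (`~=`), which accept newer releases within the last pinned version component. For example:

```
praw~=7.6                      # equivalent to: praw >= 7.6, == 7.*
youtube_transcript_api~=0.5.0  # equivalent to: >= 0.5.0, == 0.5.*
```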

View File

@@ -1,2 +1,2 @@
-tqdm==4.64.1
-beautifulsoup4==4.11.2
+tqdm~=4.64
+beautifulsoup4~=4.11

View File

@@ -1,3 +1,3 @@
 beautifulsoup4
 requests
-urllib3
\ No newline at end of file
+urllib3

View File

@@ -1 +1 @@
-playwright==1.30.0
+playwright~=1.30

View File

@@ -1 +1 @@
-trafilatura==1.4.1
+trafilatura~=1.4

View File

@@ -1 +1 @@
-wikipedia==1.4.0
+wikipedia~=1.4

View File

@@ -1 +1 @@
-youtube_transcript_api==0.5.0
+youtube_transcript_api~=0.5.0