mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-15 04:01:32 +00:00
146 lines
4.9 KiB
Python
146 lines
4.9 KiB
Python
"""Google sheets reader."""
|
|
|
|
import logging
|
|
import os
|
|
from typing import Any, List
|
|
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"]
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Copyright 2019 Google LLC
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
|
|
class GoogleSheetsReader(BaseReader):
|
|
"""Google Sheets reader.
|
|
|
|
Reads a sheet as TSV from Google Sheets
|
|
|
|
"""
|
|
|
|
def __init__(self) -> None:
|
|
"""Initialize with parameters."""
|
|
try:
|
|
import google # noqa: F401
|
|
import google_auth_oauthlib # noqa: F401
|
|
import googleapiclient # noqa: F401
|
|
except ImportError:
|
|
raise ImportError(
|
|
"`google_auth_oauthlib`, `googleapiclient` and `google` "
|
|
"must be installed to use the GoogleSheetsReader.\n"
|
|
"Please run `pip install --upgrade google-api-python-client "
|
|
"google-auth-httplib2 google-auth-oauthlib`."
|
|
)
|
|
|
|
def load_data(self, spreadsheet_ids: List[str]) -> List[Document]:
|
|
"""Load data from the input directory.
|
|
|
|
Args:
|
|
spreadsheet_ids (List[str]): a list of document ids.
|
|
"""
|
|
if spreadsheet_ids is None:
|
|
raise ValueError('Must specify a "spreadsheet_ids" in `load_kwargs`.')
|
|
|
|
results = []
|
|
for spreadsheet_id in spreadsheet_ids:
|
|
sheet = self._load_sheet(spreadsheet_id)
|
|
results.append(
|
|
Document(sheet, extra_info={"spreadsheet_id": spreadsheet_id})
|
|
)
|
|
return results
|
|
|
|
def _load_sheet(self, spreadsheet_id: str) -> str:
|
|
"""Load a sheet from Google Sheets.
|
|
|
|
Args:
|
|
spreadsheet_id: the sheet id.
|
|
|
|
Returns:
|
|
The sheet data.
|
|
"""
|
|
import googleapiclient.discovery as discovery
|
|
|
|
credentials = self._get_credentials()
|
|
sheets_service = discovery.build("sheets", "v4", credentials=credentials)
|
|
spreadsheet_data = (
|
|
sheets_service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute()
|
|
)
|
|
sheets = spreadsheet_data.get("sheets")
|
|
sheet_text = ""
|
|
|
|
for sheet in sheets:
|
|
properties = sheet.get("properties")
|
|
title = properties.get("title")
|
|
sheet_text += title + "\n"
|
|
grid_props = properties.get("gridProperties")
|
|
rows = grid_props.get("rowCount")
|
|
cols = grid_props.get("columnCount")
|
|
range_pattern = f"R1C1:R{rows}C{cols}"
|
|
response = (
|
|
sheets_service.spreadsheets()
|
|
.values()
|
|
.get(spreadsheetId=spreadsheet_id, range=range_pattern)
|
|
.execute()
|
|
)
|
|
sheet_text += (
|
|
"\n".join(map(lambda row: "\t".join(row), response.get("values", [])))
|
|
+ "\n"
|
|
)
|
|
return sheet_text
|
|
|
|
def _get_credentials(self) -> Any:
|
|
"""Get valid user credentials from storage.
|
|
|
|
The file token.json stores the user's access and refresh tokens, and is
|
|
created automatically when the authorization flow completes for the first
|
|
time.
|
|
|
|
Returns:
|
|
Credentials, the obtained credential.
|
|
"""
|
|
from google.auth.transport.requests import Request
|
|
from google.oauth2.credentials import Credentials
|
|
from google_auth_oauthlib.flow import InstalledAppFlow
|
|
|
|
creds = None
|
|
if os.path.exists("token.json"):
|
|
creds = Credentials.from_authorized_user_file("token.json", SCOPES)
|
|
# If there are no (valid) credentials available, let the user log in.
|
|
if not creds or not creds.valid:
|
|
if creds and creds.expired and creds.refresh_token:
|
|
creds.refresh(Request())
|
|
else:
|
|
flow = InstalledAppFlow.from_client_secrets_file(
|
|
"credentials.json", SCOPES
|
|
)
|
|
creds = flow.run_local_server(port=0)
|
|
# Save the credentials for the next run
|
|
with open("token.json", "w") as token:
|
|
token.write(creds.to_json())
|
|
|
|
return creds
|
|
|
|
|
|
if __name__ == "__main__":
|
|
reader = GoogleSheetsReader()
|
|
logger.info(
|
|
reader.load_data(
|
|
spreadsheet_ids=["1VkuitKIyNmkoCJJDmEUmkS_VupSkDcztpRhbUzAU5L8"]
|
|
)
|
|
)
|