From eac86b14feff495ae9dfd65eceee6cced57bd518 Mon Sep 17 00:00:00 2001 From: Bruno Bornsztein Date: Tue, 7 Mar 2023 19:05:40 -0600 Subject: [PATCH 1/5] gmail reader update --- loader_hub/gmail/README.md | 0 loader_hub/gmail/__init__.py | 1 + loader_hub/gmail/base.py | 138 +++++++++++++++++++++++++++++++++++ loader_hub/library.json | 9 ++- 4 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 loader_hub/gmail/README.md create mode 100644 loader_hub/gmail/__init__.py create mode 100644 loader_hub/gmail/base.py diff --git a/loader_hub/gmail/README.md b/loader_hub/gmail/README.md new file mode 100644 index 00000000..e69de29b diff --git a/loader_hub/gmail/__init__.py b/loader_hub/gmail/__init__.py new file mode 100644 index 00000000..1d464056 --- /dev/null +++ b/loader_hub/gmail/__init__.py @@ -0,0 +1 @@ +"""Init file.""" diff --git a/loader_hub/gmail/base.py b/loader_hub/gmail/base.py new file mode 100644 index 00000000..832ad06a --- /dev/null +++ b/loader_hub/gmail/base.py @@ -0,0 +1,138 @@ +"""Google Mail reader.""" +import os +from googleapiclient.discovery import build +import email +from typing import Any, List +from llama_index.readers.base import BaseReader +from llama_index.readers.schema.base import Document +from pydantic import BaseModel +import base64 + +SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"] + + +class GmailReader(BaseReader, BaseModel): + """Gmail reader. + + Reads emails + + """ + query: str = None + max_results: int = 10 + service: Any + + def load_data( + self + ) -> List[Document]: + """Load emails from the user's account + + Args: + number_of_results (Optional[int]): the number of events to return. Defaults to 100. + start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today. 
+ """ + credentials = self._get_credentials() + self.service = build('gmail', 'v1', credentials=credentials) + + messsages = self.search_messages() + + results = [] + for message in messsages: + text = message.pop('body') + extra_info = message + results.append(Document(text, extra_info=extra_info)) + + return results + + def _get_credentials(self) -> Any: + """Get valid user credentials from storage. + + The file token.json stores the user's access and refresh tokens, and is + created automatically when the authorization flow completes for the first + time. + + Returns: + Credentials, the obtained credential. + """ + from google.auth.transport.requests import Request + from google.oauth2.credentials import Credentials + from google_auth_oauthlib.flow import InstalledAppFlow + + creds = None + if os.path.exists("token.json"): + creds = Credentials.from_authorized_user_file("token.json", SCOPES) + # If there are no (valid) credentials available, let the user log in. + if not creds or not creds.valid: + if creds and creds.expired and creds.refresh_token: + creds.refresh(Request()) + else: + flow = InstalledAppFlow.from_client_secrets_file( + "credentials.json", SCOPES + ) + creds = flow.run_local_server(port=0) + # Save the credentials for the next run + with open("token.json", "w") as token: + token.write(creds.to_json()) + + return creds + + def search_messages(self): + query = self.query + + max_results = self.max_results + + messages = self.service.users().messages().list( + userId='me', + q=query, + maxResults=int(max_results) + ).execute().get('messages', []) + + result = [] + try: + for message in messages: + message_data = self.get_message_data(message) + if not message_data: + continue + result.append(message_data) + except Exception as e: + raise Exception("Can't get message data" + str(e)) + + return result + + def get_message_data(self, message): + message_id = message['id'] + message_data = self.service.users().messages().get( + format="raw", + 
+ userId='me', + id=message_id).execute() + body = self.extract_message_body(message_data) + + if not body: + return None + + return { + "id": message_data['id'], + "threadId": message_data['threadId'], + "snippet": message_data['snippet'], + "body": body, + } + + def extract_message_body(self, message: dict): + from bs4 import BeautifulSoup + try: + body = base64.urlsafe_b64decode(message['raw'].encode('ASCII')) + mime_msg = email.message_from_bytes(body) + + # If the message body contains HTML, parse it with BeautifulSoup + if 'text/html' in mime_msg: + soup = BeautifulSoup(body, 'html.parser') + body = soup.get_text() + return str(body) + except Exception as e: + raise Exception("Can't parse message body" + str(e)) + + +if __name__ == "__main__": + reader = GmailReader(query="from:me after:2023-01-01") + print( + reader.load_data() + ) diff --git a/loader_hub/library.json b/loader_hub/library.json index 15f3a3fa..9b2f5e9b 100644 --- a/loader_hub/library.json +++ b/loader_hub/library.json @@ -324,9 +324,14 @@ "author": "bbornsztein", "keywords": ["wordpress", "blog"] }, - "SteamshipFileReader": { + "GmailReader": { + "id": "gmail", + "author": "bbornsztein", + "keywords": ["gmail", "email"] + }, + "SteamshipFileReader": { "id": "steamship", "author": "douglas-reid", "keywords": ["steamship"] } -} \ No newline at end of file +} From 7c87cdc3caa1f8b28333ed22a532de19878f3e24 Mon Sep 17 00:00:00 2001 From: Bruno Bornsztein Date: Wed, 8 Mar 2023 05:23:52 -0600 Subject: [PATCH 2/5] readme and lazy imports --- loader_hub/gmail/README.md | 68 ++++++++++++++++++++++++++++++++++++++ loader_hub/gmail/base.py | 5 +-- 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/loader_hub/gmail/README.md b/loader_hub/gmail/README.md index e69de29b..1273eb26 100644 --- a/loader_hub/gmail/README.md +++ b/loader_hub/gmail/README.md @@ -0,0 +1,68 @@ +# Gmail Loader + +This loader searches your Gmail account and parses the resulting emails into `Document`s. 
The search query can include normal query params, like `from: email@example.com label:inbox`. + +As a prerequisite, you will need to register with Google and generate a `credentials.json` file in the directory where you run this loader. See [here](https://developers.google.com/workspace/guides/create-credentials) for instructions. + +## Usage + +To use this loader, you simply need to pass in a search query string. + +```python +from llama_index import download_loader + +GoogleDocsReader = download_loader('GmailReader') +loader = GoogleDocsReader(query="from: me label:inbox") +documents = loader.load_data() +``` + +## Examples + +This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. + +### LlamaIndex + +```python +from llama_index import GPTSimpleVectorIndex, download_loader + +GoogleDocsReader = download_loader('GmailReader') +loader = GoogleDocsReader(query="from:me label:sent") + +documents = loader.load_data() + +index = GPTSimpleVectorIndex(documents) +index.query('What did I write about LLMs?') +``` + +### LangChain + +Note: Make sure you change the description of the `Tool` to match your use-case. 
+ +```python +from llama_index import GPTSimpleVectorIndex, download_loader +from langchain.agents import initialize_agent, Tool +from langchain.llms import OpenAI +from langchain.chains.conversation.memory import ConversationBufferMemory + +GoogleDocsReader = download_loader('GmailReader') +loader = GoogleDocsReader(query="from:me label:sent") + +documents = loader.load_data(document_ids=gdoc_ids) + +index = GPTSimpleVectorIndex(documents) + +tools = [ + Tool( + name="Gmail Index", + func=lambda q: index.query(q), + description=f"Useful when you want answer questions about emails.", + ), +] +llm = OpenAI(temperature=0) +memory = ConversationBufferMemory(memory_key="chat_history") +agent_chain = initialize_agent( + tools, llm, agent="zero-shot-react-description", memory=memory +) + +output = agent_chain.run(input="What have I written about AGI?") +``` diff --git a/loader_hub/gmail/base.py b/loader_hub/gmail/base.py index 832ad06a..fa0a31d3 100644 --- a/loader_hub/gmail/base.py +++ b/loader_hub/gmail/base.py @@ -1,6 +1,4 @@ """Google Mail reader.""" -import os -from googleapiclient.discovery import build import email from typing import Any, List from llama_index.readers.base import BaseReader @@ -30,6 +28,8 @@ class GmailReader(BaseReader, BaseModel): number_of_results (Optional[int]): the number of events to return. Defaults to 100. start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today. """ + from googleapiclient.discovery import build + credentials = self._get_credentials() self.service = build('gmail', 'v1', credentials=credentials) @@ -53,6 +53,7 @@ class GmailReader(BaseReader, BaseModel): Returns: Credentials, the obtained credential. 
""" + import os from google.auth.transport.requests import Request from google.oauth2.credentials import Credentials from google_auth_oauthlib.flow import InstalledAppFlow From 564e98c40a28eb5376bd7e3d03d8bbcc91f17979 Mon Sep 17 00:00:00 2001 From: Bruno Bornsztein Date: Wed, 8 Mar 2023 05:25:52 -0600 Subject: [PATCH 3/5] add gmail reader requirements --- loader_hub/gmail/requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 loader_hub/gmail/requirements.txt diff --git a/loader_hub/gmail/requirements.txt b/loader_hub/gmail/requirements.txt new file mode 100644 index 00000000..fcf4511e --- /dev/null +++ b/loader_hub/gmail/requirements.txt @@ -0,0 +1,4 @@ +google-api-python-client +google-auth-httplib2 +google-auth-oauthlib +beautifulsoup4 \ No newline at end of file From 68eb3b3483a40b21bbe768015eeba37fafa665c7 Mon Sep 17 00:00:00 2001 From: Bruno Bornsztein Date: Fri, 10 Mar 2023 06:33:43 -0600 Subject: [PATCH 4/5] fix readme typos --- loader_hub/gmail/README.md | 4 ++-- loader_hub/gmail/base.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/loader_hub/gmail/README.md b/loader_hub/gmail/README.md index 1273eb26..0653f7f1 100644 --- a/loader_hub/gmail/README.md +++ b/loader_hub/gmail/README.md @@ -11,8 +11,8 @@ To use this loader, you simply need to pass in a search query string. ```python from llama_index import download_loader -GoogleDocsReader = download_loader('GmailReader') -loader = GoogleDocsReader(query="from: me label:inbox") +GmailReader = download_loader('GmailReader') +loader = GmailReader(query="from: me label:inbox") documents = loader.load_data() ``` diff --git a/loader_hub/gmail/base.py b/loader_hub/gmail/base.py index fa0a31d3..88c67541 100644 --- a/loader_hub/gmail/base.py +++ b/loader_hub/gmail/base.py @@ -14,6 +14,9 @@ class GmailReader(BaseReader, BaseModel): Reads emails + Args: + query (str): Gmail query. Defaults to None. + max_results (int): Max number of results. Defaults to 10. 
""" query: str = None max_results: int = 10 @@ -23,15 +26,12 @@ class GmailReader(BaseReader, BaseModel): self ) -> List[Document]: """Load emails from the user's account - - Args: - number_of_results (Optional[int]): the number of events to return. Defaults to 100. - start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today. """ from googleapiclient.discovery import build credentials = self._get_credentials() - self.service = build('gmail', 'v1', credentials=credentials) + if not self.service: + self.service = build('gmail', 'v1', credentials=credentials) messsages = self.search_messages() From 77d5d9473c88afd93cc835112d3886ae5ae1cb03 Mon Sep 17 00:00:00 2001 From: Bruno Bornsztein Date: Sat, 11 Mar 2023 20:20:33 -0600 Subject: [PATCH 5/5] update readme --- loader_hub/gmail/README.md | 47 -------------------------------------- 1 file changed, 47 deletions(-) diff --git a/loader_hub/gmail/README.md b/loader_hub/gmail/README.md index 0653f7f1..b753ff6e 100644 --- a/loader_hub/gmail/README.md +++ b/loader_hub/gmail/README.md @@ -19,50 +19,3 @@ documents = loader.load_data() ## Examples This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. - -### LlamaIndex - -```python -from llama_index import GPTSimpleVectorIndex, download_loader - -GoogleDocsReader = download_loader('GmailReader') -loader = GoogleDocsReader(query="from:me label:sent") - -documents = loader.load_data() - -index = GPTSimpleVectorIndex(documents) -index.query('What did I write about LLMs?') -``` - -### LangChain - -Note: Make sure you change the description of the `Tool` to match your use-case. 
- -```python -from llama_index import GPTSimpleVectorIndex, download_loader -from langchain.agents import initialize_agent, Tool -from langchain.llms import OpenAI -from langchain.chains.conversation.memory import ConversationBufferMemory - -GoogleDocsReader = download_loader('GmailReader') -loader = GoogleDocsReader(query="from:me label:sent") - -documents = loader.load_data(document_ids=gdoc_ids) - -index = GPTSimpleVectorIndex(documents) - -tools = [ - Tool( - name="Gmail Index", - func=lambda q: index.query(q), - description=f"Useful when you want answer questions about emails.", - ), -] -llm = OpenAI(temperature=0) -memory = ConversationBufferMemory(memory_key="chat_history") -agent_chain = initialize_agent( - tools, llm, agent="zero-shot-react-description", memory=memory -) - -output = agent_chain.run(input="What have I written about AGI?") -```