From eac86b14feff495ae9dfd65eceee6cced57bd518 Mon Sep 17 00:00:00 2001
From: Bruno Bornsztein <bruno.bornsztein@gmail.com>
Date: Tue, 7 Mar 2023 19:05:40 -0600
Subject: [PATCH 1/5] gmail reader

update
---
 loader_hub/gmail/README.md   |   0
 loader_hub/gmail/__init__.py |   1 +
 loader_hub/gmail/base.py     | 138 +++++++++++++++++++++++++++++++++++
 loader_hub/library.json      |   9 ++-
 4 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 loader_hub/gmail/README.md
 create mode 100644 loader_hub/gmail/__init__.py
 create mode 100644 loader_hub/gmail/base.py

diff --git a/loader_hub/gmail/README.md b/loader_hub/gmail/README.md
new file mode 100644
index 00000000..e69de29b
diff --git a/loader_hub/gmail/__init__.py b/loader_hub/gmail/__init__.py
new file mode 100644
index 00000000..1d464056
--- /dev/null
+++ b/loader_hub/gmail/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/gmail/base.py b/loader_hub/gmail/base.py
new file mode 100644
index 00000000..832ad06a
--- /dev/null
+++ b/loader_hub/gmail/base.py
@@ -0,0 +1,138 @@
+"""Google Mail reader."""
+import os
+from googleapiclient.discovery import build
+import email
+from typing import Any, List
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+from pydantic import BaseModel
+import base64
+
+SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
+
+
+class GmailReader(BaseReader, BaseModel):
+    """Gmail reader.
+
+    Reads emails
+
+    """
+    query: str = None
+    max_results: int = 10
+    service: Any
+
+    def load_data(
+        self
+    ) -> List[Document]:
+        """Load emails from the user's account
+
+        Args:
+            number_of_results (Optional[int]): the number of events to return. Defaults to 100.
+            start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today.
+        """
+        credentials = self._get_credentials()
+        self.service = build('gmail', 'v1', credentials=credentials)
+
+        messsages = self.search_messages()
+
+        results = []
+        for message in messsages:
+            text = message.pop('body')
+            extra_info = message
+            results.append(Document(text, extra_info=extra_info))
+
+        return results
+
+    def _get_credentials(self) -> Any:
+        """Return Gmail API credentials, running the OAuth flow when needed.
+
+        A cached token is read from ``token.json`` in the working directory. If
+        it is missing or invalid it is refreshed (when expired and refreshable)
+        or re-created from ``credentials.json``, then saved to ``token.json``.
+
+        Returns:
+            Credentials, the obtained credential.
+        """
+        from google.auth.transport.requests import Request
+        from google.oauth2.credentials import Credentials
+        from google_auth_oauthlib.flow import InstalledAppFlow
+
+        creds = None
+        if os.path.exists("token.json"):
+            creds = Credentials.from_authorized_user_file("token.json", SCOPES)
+        # No valid cached token: refresh an expired one, otherwise run the local-server OAuth flow.
+        if not creds or not creds.valid:
+            if creds and creds.expired and creds.refresh_token:
+                creds.refresh(Request())
+            else:
+                flow = InstalledAppFlow.from_client_secrets_file(
+                    "credentials.json", SCOPES
+                )
+                creds = flow.run_local_server(port=0)
+            # Cache the (new or refreshed) token so the next run can reuse it.
+            with open("token.json", "w") as token:
+                token.write(creds.to_json())
+
+        return creds
+
+    def search_messages(self):
+        """Run the configured Gmail search and load every matching message.
+
+        Lists at most ``self.max_results`` message stubs matching ``self.query``
+        through ``self.service`` and resolves each with ``get_message_data``.
+
+        Returns:
+            A list with one dict per message whose body is non-empty.
+
+        Raises:
+            Exception: if loading any of the listed messages fails.
+        """
+        listing = self.service.users().messages().list(
+            userId='me', q=self.query, maxResults=int(self.max_results)
+        ).execute()
+        stubs = listing.get('messages', [])
+        try:
+            fetched = [self.get_message_data(stub) for stub in stubs]
+        except Exception as e:
+            raise Exception("Can't get message data" + str(e))
+        # get_message_data yields None for messages without a body; drop them.
+        return [data for data in fetched if data]
+
+    def get_message_data(self, message):
+        """Fetch one message in raw format and return its metadata and body.
+
+        Args:
+            message: dict holding the ``id`` of the message to fetch.
+
+        Returns:
+            dict with ``id``, ``threadId``, ``snippet`` and ``body``, or None
+            when ``extract_message_body`` returns nothing.
+        """
+        request = self.service.users().messages().get(
+            format="raw", userId='me', id=message['id'])
+        raw = request.execute()
+        text = self.extract_message_body(raw)
+        if not text:
+            return None
+        return {**{key: raw[key] for key in ("id", "threadId", "snippet")}, "body": text}
+
+    def extract_message_body(self, message: dict):
+        """Return the readable body text of a ``format="raw"`` Gmail message ('' if none)."""
+        from bs4 import BeautifulSoup
+        from email import policy
+        try:
+            raw = base64.urlsafe_b64decode(message['raw'].encode('ASCII'))
+            # Choose the body by MIME part: ``'text/html' in mime_msg`` only tests header names.
+            part = email.message_from_bytes(raw, policy=policy.default).get_body(('plain', 'html'))
+            text = part.get_content() if part is not None else ''
+            is_html = part is not None and part.get_content_subtype() == 'html'
+            return BeautifulSoup(text, 'html.parser').get_text() if is_html else text
+        except Exception as e:
+            raise Exception("Can't parse message body" + str(e))
+
+
+if __name__ == "__main__":
+    # Manual smoke test: run the sample query below and print the resulting
+    # Documents (needs credentials.json or a cached token.json in the cwd).
+    reader = GmailReader(query="from:me after:2023-01-01")
+    print(reader.load_data())
diff --git a/loader_hub/library.json b/loader_hub/library.json
index 15f3a3fa..9b2f5e9b 100644
--- a/loader_hub/library.json
+++ b/loader_hub/library.json
@@ -324,9 +324,14 @@
     "author": "bbornsztein",
     "keywords": ["wordpress", "blog"]
   },
-  "SteamshipFileReader": {
+  "GmailReader": {
+    "id": "gmail",
+    "author": "bbornsztein",
+    "keywords": ["gmail", "email"]
+  },
+ "SteamshipFileReader": {
     "id": "steamship",
     "author": "douglas-reid",
     "keywords": ["steamship"]
   }
-}
\ No newline at end of file
+}

From 7c87cdc3caa1f8b28333ed22a532de19878f3e24 Mon Sep 17 00:00:00 2001
From: Bruno Bornsztein <bruno.bornsztein@gmail.com>
Date: Wed, 8 Mar 2023 05:23:52 -0600
Subject: [PATCH 2/5] readme and lazy imports

---
 loader_hub/gmail/README.md | 68 ++++++++++++++++++++++++++++++++++++++
 loader_hub/gmail/base.py   |  5 +--
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/loader_hub/gmail/README.md b/loader_hub/gmail/README.md
index e69de29b..1273eb26 100644
--- a/loader_hub/gmail/README.md
+++ b/loader_hub/gmail/README.md
@@ -0,0 +1,68 @@
+# Gmail Loader
+
+This loader searches your Gmail account and parses the resulting emails into `Document`s. The search query can include normal query params, like `from: email@example.com label:inbox`.
+
+As a prerequisite, you will need to register with Google, generate a `credentials.json` file, and place it in the directory where you run this loader. After authorization, the loader caches the resulting token in a `token.json` file in the same directory. See [here](https://developers.google.com/workspace/guides/create-credentials) for instructions.
+
+## Usage
+
+To use this loader, you simply need to pass in a search query string.
+
+```python
+from llama_index import download_loader
+
+GoogleDocsReader = download_loader('GmailReader')
+loader = GoogleDocsReader(query="from: me label:inbox")
+documents = loader.load_data()
+```
+
+## Examples
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
+
+### LlamaIndex
+
+```python
+from llama_index import GPTSimpleVectorIndex, download_loader
+
+GoogleDocsReader = download_loader('GmailReader')
+loader = GoogleDocsReader(query="from:me label:sent")
+
+documents = loader.load_data()
+
+index = GPTSimpleVectorIndex(documents)
+index.query('What did I write about LLMs?')
+```
+
+### LangChain
+
+Note: Make sure you change the description of the `Tool` to match your use-case.
+
+```python
+from llama_index import GPTSimpleVectorIndex, download_loader
+from langchain.agents import initialize_agent, Tool
+from langchain.llms import OpenAI
+from langchain.chains.conversation.memory import ConversationBufferMemory
+
+GoogleDocsReader = download_loader('GmailReader')
+loader = GoogleDocsReader(query="from:me label:sent")
+
+documents = loader.load_data(document_ids=gdoc_ids)
+
+index = GPTSimpleVectorIndex(documents)
+
+tools = [
+    Tool(
+        name="Gmail Index",
+        func=lambda q: index.query(q),
+        description=f"Useful when you want answer questions about emails.",
+    ),
+]
+llm = OpenAI(temperature=0)
+memory = ConversationBufferMemory(memory_key="chat_history")
+agent_chain = initialize_agent(
+    tools, llm, agent="zero-shot-react-description", memory=memory
+)
+
+output = agent_chain.run(input="What have I written about AGI?")
+```
diff --git a/loader_hub/gmail/base.py b/loader_hub/gmail/base.py
index 832ad06a..fa0a31d3 100644
--- a/loader_hub/gmail/base.py
+++ b/loader_hub/gmail/base.py
@@ -1,6 +1,4 @@
 """Google Mail reader."""
-import os
-from googleapiclient.discovery import build
 import email
 from typing import Any, List
 from llama_index.readers.base import BaseReader
@@ -30,6 +28,8 @@ class GmailReader(BaseReader, BaseModel):
             number_of_results (Optional[int]): the number of events to return. Defaults to 100.
             start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today.
         """
+        from googleapiclient.discovery import build
+
         credentials = self._get_credentials()
         self.service = build('gmail', 'v1', credentials=credentials)
 
@@ -53,6 +53,7 @@ class GmailReader(BaseReader, BaseModel):
         Returns:
             Credentials, the obtained credential.
         """
+        import os
         from google.auth.transport.requests import Request
         from google.oauth2.credentials import Credentials
         from google_auth_oauthlib.flow import InstalledAppFlow

From 564e98c40a28eb5376bd7e3d03d8bbcc91f17979 Mon Sep 17 00:00:00 2001
From: Bruno Bornsztein <bruno.bornsztein@gmail.com>
Date: Wed, 8 Mar 2023 05:25:52 -0600
Subject: [PATCH 3/5] add gmail reader requirements

---
 loader_hub/gmail/requirements.txt | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 loader_hub/gmail/requirements.txt

diff --git a/loader_hub/gmail/requirements.txt b/loader_hub/gmail/requirements.txt
new file mode 100644
index 00000000..fcf4511e
--- /dev/null
+++ b/loader_hub/gmail/requirements.txt
@@ -0,0 +1,4 @@
+google-api-python-client
+google-auth-httplib2
+google-auth-oauthlib
+beautifulsoup4
\ No newline at end of file

From 68eb3b3483a40b21bbe768015eeba37fafa665c7 Mon Sep 17 00:00:00 2001
From: Bruno Bornsztein <bruno.bornsztein@gmail.com>
Date: Fri, 10 Mar 2023 06:33:43 -0600
Subject: [PATCH 4/5] fix readme typos

---
 loader_hub/gmail/README.md |  4 ++--
 loader_hub/gmail/base.py   | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/loader_hub/gmail/README.md b/loader_hub/gmail/README.md
index 1273eb26..0653f7f1 100644
--- a/loader_hub/gmail/README.md
+++ b/loader_hub/gmail/README.md
@@ -11,8 +11,8 @@ To use this loader, you simply need to pass in a search query string.
 ```python
 from llama_index import download_loader
 
-GoogleDocsReader = download_loader('GmailReader')
-loader = GoogleDocsReader(query="from: me label:inbox")
+GmailReader = download_loader('GmailReader')
+loader = GmailReader(query="from: me label:inbox")
 documents = loader.load_data()
 ```
 
diff --git a/loader_hub/gmail/base.py b/loader_hub/gmail/base.py
index fa0a31d3..88c67541 100644
--- a/loader_hub/gmail/base.py
+++ b/loader_hub/gmail/base.py
@@ -14,6 +14,9 @@ class GmailReader(BaseReader, BaseModel):
 
     Reads emails
 
+    Args:
+        query (str): Gmail query. Defaults to None.
+        max_results (int): Max number of results. Defaults to 10.
     """
     query: str = None
     max_results: int = 10
@@ -23,15 +26,12 @@ class GmailReader(BaseReader, BaseModel):
         self
     ) -> List[Document]:
         """Load emails from the user's account
-
-        Args:
-            number_of_results (Optional[int]): the number of events to return. Defaults to 100.
-            start_date (Optional[Union[str, datetime.date]]): the start date to return events from. Defaults to today.
         """
         from googleapiclient.discovery import build
 
         credentials = self._get_credentials()
-        self.service = build('gmail', 'v1', credentials=credentials)
+        if not self.service:
+            self.service = build('gmail', 'v1', credentials=credentials)
 
         messsages = self.search_messages()
 

From 77d5d9473c88afd93cc835112d3886ae5ae1cb03 Mon Sep 17 00:00:00 2001
From: Bruno Bornsztein <bruno.bornsztein@gmail.com>
Date: Sat, 11 Mar 2023 20:20:33 -0600
Subject: [PATCH 5/5] update readme

---
 loader_hub/gmail/README.md | 47 --------------------------------------
 1 file changed, 47 deletions(-)

diff --git a/loader_hub/gmail/README.md b/loader_hub/gmail/README.md
index 0653f7f1..b753ff6e 100644
--- a/loader_hub/gmail/README.md
+++ b/loader_hub/gmail/README.md
@@ -19,50 +19,3 @@ documents = loader.load_data()
 ## Examples
 
 This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
-
-### LlamaIndex
-
-```python
-from llama_index import GPTSimpleVectorIndex, download_loader
-
-GoogleDocsReader = download_loader('GmailReader')
-loader = GoogleDocsReader(query="from:me label:sent")
-
-documents = loader.load_data()
-
-index = GPTSimpleVectorIndex(documents)
-index.query('What did I write about LLMs?')
-```
-
-### LangChain
-
-Note: Make sure you change the description of the `Tool` to match your use-case.
-
-```python
-from llama_index import GPTSimpleVectorIndex, download_loader
-from langchain.agents import initialize_agent, Tool
-from langchain.llms import OpenAI
-from langchain.chains.conversation.memory import ConversationBufferMemory
-
-GoogleDocsReader = download_loader('GmailReader')
-loader = GoogleDocsReader(query="from:me label:sent")
-
-documents = loader.load_data(document_ids=gdoc_ids)
-
-index = GPTSimpleVectorIndex(documents)
-
-tools = [
-    Tool(
-        name="Gmail Index",
-        func=lambda q: index.query(q),
-        description=f"Useful when you want answer questions about emails.",
-    ),
-]
-llm = OpenAI(temperature=0)
-memory = ConversationBufferMemory(memory_key="chat_history")
-agent_chain = initialize_agent(
-    tools, llm, agent="zero-shot-react-description", memory=memory
-)
-
-output = agent_chain.run(input="What have I written about AGI?")
-```