diff --git a/loader_hub/gmail/README.md b/loader_hub/gmail/README.md
new file mode 100644
index 00000000..b753ff6e
--- /dev/null
+++ b/loader_hub/gmail/README.md
@@ -0,0 +1,21 @@
+# Gmail Loader
+
+This loader searches your Gmail account and parses the resulting emails into `Document`s. The search query can include normal query params, like `from: email@example.com label:inbox`.
+
+As a prerequisite, you will need to register with Google and generate a `credentials.json` file in the directory where you run this loader. See [here](https://developers.google.com/workspace/guides/create-credentials) for instructions.
+
+## Usage
+
+To use this loader, you simply need to pass in a search query string.
+
+```python
+from llama_index import download_loader
+
+GmailReader = download_loader('GmailReader')
+loader = GmailReader(query="from: me label:inbox")
+documents = loader.load_data()
+```
+
+## Examples
+
+This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
diff --git a/loader_hub/gmail/__init__.py b/loader_hub/gmail/__init__.py
new file mode 100644
index 00000000..1d464056
--- /dev/null
+++ b/loader_hub/gmail/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/gmail/base.py b/loader_hub/gmail/base.py
new file mode 100644
index 00000000..88c67541
--- /dev/null
+++ b/loader_hub/gmail/base.py
@@ -0,0 +1,200 @@
+"""Google Mail reader."""
+import base64
+import email
+from typing import Any, List, Optional
+
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+from pydantic import BaseModel
+
+SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
+
+
+class GmailReader(BaseReader, BaseModel):
+    """Gmail reader.
+
+    Searches the user's mailbox and turns each matching email into a
+    ``Document``.
+
+    Args:
+        query (Optional[str]): Gmail search query. Defaults to None.
+        max_results (int): Max number of results. Defaults to 10.
+        service (Any): Pre-built Gmail API service. When left unset, one is
+            built from local OAuth credentials on the first ``load_data``
+            call. Defaults to None.
+    """
+
+    query: Optional[str] = None
+    max_results: int = 10
+    service: Any = None
+
+    def load_data(self) -> List[Document]:
+        """Load emails from the user's account.
+
+        Returns:
+            One ``Document`` per matching message. The text is the message
+            body; ``extra_info`` carries ``id``, ``threadId`` and ``snippet``.
+        """
+        # Skip the OAuth flow entirely when a service was injected.
+        if not self.service:
+            from googleapiclient.discovery import build
+
+            credentials = self._get_credentials()
+            self.service = build("gmail", "v1", credentials=credentials)
+
+        results = []
+        for message in self.search_messages():
+            text = message.pop("body")
+            results.append(Document(text, extra_info=message))
+
+        return results
+
+    def _get_credentials(self) -> Any:
+        """Get valid user credentials from storage.
+
+        The file token.json stores the user's access and refresh tokens, and is
+        created automatically when the authorization flow completes for the first
+        time.
+
+        Returns:
+            Credentials, the obtained credential.
+        """
+        import os
+
+        from google.auth.transport.requests import Request
+        from google.oauth2.credentials import Credentials
+        from google_auth_oauthlib.flow import InstalledAppFlow
+
+        creds = None
+        if os.path.exists("token.json"):
+            creds = Credentials.from_authorized_user_file("token.json", SCOPES)
+        # If there are no (valid) credentials available, let the user log in.
+        if not creds or not creds.valid:
+            if creds and creds.expired and creds.refresh_token:
+                creds.refresh(Request())
+            else:
+                flow = InstalledAppFlow.from_client_secrets_file(
+                    "credentials.json", SCOPES
+                )
+                creds = flow.run_local_server(port=0)
+            # Save the credentials for the next run
+            with open("token.json", "w") as token:
+                token.write(creds.to_json())
+
+        return creds
+
+    def search_messages(self) -> List[dict]:
+        """Search the mailbox and fetch every matching message.
+
+        Returns:
+            The dicts produced by ``get_message_data``; messages without a
+            body are skipped.
+
+        Raises:
+            Exception: If fetching or parsing a message fails.
+        """
+        messages = (
+            self.service.users()
+            .messages()
+            .list(userId="me", q=self.query, maxResults=int(self.max_results))
+            .execute()
+            .get("messages", [])
+        )
+
+        result = []
+        try:
+            for message in messages:
+                message_data = self.get_message_data(message)
+                if not message_data:
+                    continue
+                result.append(message_data)
+        except Exception as e:
+            raise Exception("Can't get message data: " + str(e)) from e
+
+        return result
+
+    def get_message_data(self, message: dict) -> Optional[dict]:
+        """Fetch one message and collect its id, thread id, snippet and body.
+
+        Args:
+            message (dict): Entry of the message listing; only ``id`` is read.
+
+        Returns:
+            A dict with ``id``, ``threadId``, ``snippet`` and ``body``, or
+            None when no body text could be extracted.
+        """
+        message_data = (
+            self.service.users()
+            .messages()
+            .get(format="raw", userId="me", id=message["id"])
+            .execute()
+        )
+        body = self.extract_message_body(message_data)
+
+        if not body:
+            return None
+
+        return {
+            "id": message_data["id"],
+            "threadId": message_data["threadId"],
+            "snippet": message_data["snippet"],
+            "body": body,
+        }
+
+    def extract_message_body(self, message: dict) -> str:
+        """Decode a raw Gmail message and return its readable text.
+
+        Plain-text parts are preferred; HTML parts are used, stripped of
+        markup, only when the message has no plain-text part. Attachments
+        are ignored.
+
+        Args:
+            message (dict): Gmail API message whose ``raw`` field holds the
+                base64url-encoded RFC 2822 message.
+
+        Returns:
+            The body text, or an empty string if no text part was found.
+
+        Raises:
+            Exception: If the message cannot be decoded or parsed.
+        """
+        from bs4 import BeautifulSoup
+
+        try:
+            raw = base64.urlsafe_b64decode(message["raw"].encode("ASCII"))
+            mime_msg = email.message_from_bytes(raw)
+
+            plain_parts = []
+            html_parts = []
+            for part in mime_msg.walk():
+                if part.is_multipart():
+                    continue
+                if part.get_content_disposition() == "attachment":
+                    continue
+                content_type = part.get_content_type()
+                if content_type not in ("text/plain", "text/html"):
+                    continue
+                payload = part.get_payload(decode=True)
+                if payload is None:
+                    continue
+                charset = part.get_content_charset() or "utf-8"
+                try:
+                    text = payload.decode(charset, errors="replace")
+                except LookupError:
+                    # Unknown charset label; fall back to UTF-8.
+                    text = payload.decode("utf-8", errors="replace")
+                if content_type == "text/html":
+                    # Strip markup so only the visible text is indexed.
+                    text = BeautifulSoup(text, "html.parser").get_text()
+                    html_parts.append(text)
+                else:
+                    plain_parts.append(text)
+
+            return "\n".join(plain_parts or html_parts)
+        except Exception as e:
+            raise Exception("Can't parse message body: " + str(e)) from e
+
+
+if __name__ == "__main__":
+    reader = GmailReader(query="from:me after:2023-01-01")
+    print(reader.load_data())
diff --git a/loader_hub/gmail/requirements.txt b/loader_hub/gmail/requirements.txt
new file mode 100644
index 00000000..fcf4511e
--- /dev/null
+++ b/loader_hub/gmail/requirements.txt
@@ -0,0 +1,4 @@
+google-api-python-client
+google-auth-httplib2
+google-auth-oauthlib
+beautifulsoup4
\ No newline at end of file
diff --git a/loader_hub/library.json b/loader_hub/library.json
index 15f3a3fa..9b2f5e9b 100644
--- a/loader_hub/library.json
+++ b/loader_hub/library.json
@@ -324,9 +324,14 @@
     "author": "bbornsztein",
     "keywords": ["wordpress", "blog"]
   },
-  "SteamshipFileReader": {
+  "GmailReader": {
+    "id": "gmail",
+    "author": "bbornsztein",
+    "keywords": ["gmail", "email"]
+  },
+  "SteamshipFileReader": {
     "id": "steamship",
     "author": "douglas-reid",
     "keywords": ["steamship"]
   }
-}
\ No newline at end of file
+}