mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-11-01 18:29:53 +00:00
Merge pull request #92 from AgentHQ/main
This commit is contained in:
commit
fdc44a79cb
21
loader_hub/gmail/README.md
Normal file
21
loader_hub/gmail/README.md
Normal file
@ -0,0 +1,21 @@
|
||||
# Gmail Loader
|
||||
|
||||
This loader searches your Gmail account and parses the resulting emails into `Document`s. The search query can include normal query params, like `from:email@example.com label:inbox`.
|
||||
|
||||
As a prerequisite, you will need to register with Google and generate a `credentials.json` file in the directory where you run this loader. See [here](https://developers.google.com/workspace/guides/create-credentials) for instructions.
|
||||
|
||||
## Usage
|
||||
|
||||
To use this loader, you simply need to pass in a search query string.
|
||||
|
||||
```python
|
||||
from llama_index import download_loader
|
||||
|
||||
GmailReader = download_loader('GmailReader')
|
||||
loader = GmailReader(query="from: me label:inbox")
|
||||
documents = loader.load_data()
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent.
|
||||
1
loader_hub/gmail/__init__.py
Normal file
1
loader_hub/gmail/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
139
loader_hub/gmail/base.py
Normal file
139
loader_hub/gmail/base.py
Normal file
@ -0,0 +1,139 @@
|
||||
"""Google Mail reader."""
|
||||
import email
|
||||
from typing import Any, List
|
||||
from llama_index.readers.base import BaseReader
|
||||
from llama_index.readers.schema.base import Document
|
||||
from pydantic import BaseModel
|
||||
import base64
|
||||
|
||||
# OAuth scopes requested when authorizing against the Gmail API; the single
# entry is the read-only Gmail scope.
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
|
||||
|
||||
|
||||
class GmailReader(BaseReader, BaseModel):
    """Gmail reader.

    Searches the authenticated user's mailbox with a Gmail query and turns
    every matching email into a ``Document`` whose text is the message body
    and whose ``extra_info`` holds the message ``id``, ``threadId`` and
    ``snippet``.

    Args:
        query (str): Gmail query. Defaults to None.
        max_results (int): Max number of results. Defaults to 10.
        service (Any): Gmail API service object to use. When left unset, one
            is built on the first ``load_data`` call from locally stored
            OAuth credentials.
    """

    query: str = None
    max_results: int = 10
    service: Any = None

    def load_data(self) -> List[Document]:
        """Load emails from the user's account.

        Returns:
            List[Document]: One document per message that has a non-empty
            body.
        """
        # Only run the (possibly interactive) credential flow when no service
        # was supplied; a caller-provided service must not trigger OAuth.
        if not self.service:
            from googleapiclient.discovery import build

            credentials = self._get_credentials()
            self.service = build('gmail', 'v1', credentials=credentials)

        results = []
        for message in self.search_messages():
            # The body becomes the document text; whatever remains in the
            # dict (id, threadId, snippet) is attached as metadata.
            text = message.pop('body')
            results.append(Document(text, extra_info=message))

        return results

    def _get_credentials(self) -> Any:
        """Get valid user credentials from storage.

        The file token.json stores the user's access and refresh tokens, and
        is created automatically when the authorization flow completes for
        the first time. Both token.json and credentials.json are resolved
        relative to the current working directory.

        Returns:
            Credentials, the obtained credential.
        """
        import os

        from google.auth.transport.requests import Request
        from google.oauth2.credentials import Credentials
        from google_auth_oauthlib.flow import InstalledAppFlow

        creds = None
        if os.path.exists("token.json"):
            creds = Credentials.from_authorized_user_file("token.json", SCOPES)

        # If there are no (valid) credentials available, let the user log in.
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    "credentials.json", SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Save the credentials for the next run
            with open("token.json", "w") as token:
                token.write(creds.to_json())

        return creds

    def search_messages(self):
        """Run the configured query and fetch every matching message.

        Returns:
            list: One dict per message with the keys ``id``, ``threadId``,
            ``snippet`` and ``body``. Messages without a body are skipped.

        Raises:
            Exception: If fetching or parsing any message fails; the original
                error is chained as the cause.
        """
        messages = self.service.users().messages().list(
            userId='me',
            q=self.query,
            maxResults=int(self.max_results),
        ).execute().get('messages', [])

        result = []
        try:
            for message in messages:
                message_data = self.get_message_data(message)
                if not message_data:
                    continue
                result.append(message_data)
        except Exception as e:
            raise Exception("Can't get message data: " + str(e)) from e

        return result

    def get_message_data(self, message):
        """Fetch one message in raw format and extract its fields.

        Args:
            message (dict): A search result entry; only its ``id`` is read.

        Returns:
            dict or None: ``id``, ``threadId``, ``snippet`` and ``body`` of
            the message, or None when no body text could be extracted.
        """
        message_id = message['id']
        message_data = self.service.users().messages().get(
            format="raw",
            userId='me',
            id=message_id,
        ).execute()

        body = self.extract_message_body(message_data)
        if not body:
            return None

        return {
            "id": message_data['id'],
            "threadId": message_data['threadId'],
            "snippet": message_data['snippet'],
            "body": body,
        }

    def extract_message_body(self, message: dict):
        """Decode a raw Gmail message and return its body as text.

        The ``raw`` field is URL-safe base64 of the full email. The email is
        parsed as MIME; ``text/plain`` parts are preferred, and ``text/html``
        parts are used (stripped of markup) only when no plain text exists.
        Attachments are ignored.

        Args:
            message (dict): Message resource containing a ``raw`` field.

        Returns:
            str: The decoded body text, or an empty string when the message
            has no textual part.

        Raises:
            Exception: If the message cannot be decoded or parsed; the
                original error is chained as the cause.
        """
        from bs4 import BeautifulSoup

        try:
            raw = base64.urlsafe_b64decode(message['raw'].encode('ASCII'))
            mime_msg = email.message_from_bytes(raw)

            plain_parts = []
            html_parts = []
            for part in mime_msg.walk():
                # Multipart containers carry no payload of their own.
                if part.is_multipart():
                    continue
                if part.get_content_disposition() == 'attachment':
                    continue
                content_type = part.get_content_type()
                if content_type not in ('text/plain', 'text/html'):
                    continue

                # decode=True undoes the transfer encoding
                # (base64 / quoted-printable) and yields bytes.
                payload = part.get_payload(decode=True)
                if payload is None:
                    continue
                charset = part.get_content_charset() or 'utf-8'
                try:
                    text = payload.decode(charset, errors='replace')
                except LookupError:
                    # Unknown charset label in the message; fall back.
                    text = payload.decode('utf-8', errors='replace')

                if content_type == 'text/plain':
                    plain_parts.append(text)
                else:
                    html_parts.append(text)

            if plain_parts:
                return "\n".join(plain_parts)
            if html_parts:
                soup = BeautifulSoup("\n".join(html_parts), 'html.parser')
                return soup.get_text()
            return ""
        except Exception as e:
            raise Exception("Can't parse message body: " + str(e)) from e
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: load the messages matching a sample query and dump
    # the resulting documents to stdout.
    gmail_reader = GmailReader(query="from:me after:2023-01-01")
    loaded_documents = gmail_reader.load_data()
    print(loaded_documents)
|
||||
4
loader_hub/gmail/requirements.txt
Normal file
4
loader_hub/gmail/requirements.txt
Normal file
@ -0,0 +1,4 @@
|
||||
google-api-python-client
|
||||
google-auth-httplib2
|
||||
google-auth-oauthlib
|
||||
beautifulsoup4
|
||||
@ -324,9 +324,14 @@
|
||||
"author": "bbornsztein",
|
||||
"keywords": ["wordpress", "blog"]
|
||||
},
|
||||
"SteamshipFileReader": {
|
||||
"GmailReader": {
|
||||
"id": "gmail",
|
||||
"author": "bbornsztein",
|
||||
"keywords": ["gmail", "email"]
|
||||
},
|
||||
"SteamshipFileReader": {
|
||||
"id": "steamship",
|
||||
"author": "douglas-reid",
|
||||
"keywords": ["steamship"]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user