mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-13 11:11:48 +00:00
164 lines
5.2 KiB
Python
164 lines
5.2 KiB
Python
"""Google Mail reader."""
|
|
import email
|
|
from typing import Any, List
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
from pydantic import BaseModel
|
|
import base64
|
|
|
|
# OAuth scopes requested when loading or creating Gmail credentials
# (the read-only Gmail scope).
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
|
|
|
|
|
|
class GmailReader(BaseReader, BaseModel):
    """Gmail reader.

    Reads emails from the authenticated user's Gmail account (``userId='me'``)
    and turns each message into a ``Document``.

    Args:
        query (str): Gmail query. Defaults to None.
        use_iterative_parser (bool): When True, only the ``text/plain`` parts
            of each message are extracted by walking its MIME tree. When
            False, the raw message text is returned (tags stripped with
            BeautifulSoup if the message has an HTML part). Defaults to False.
        max_results (int): Max number of results. Defaults to 10.
        service (Any): Pre-built Gmail API service object. When left unset,
            ``load_data`` builds one from locally stored OAuth credentials.
    """

    query: str = None
    use_iterative_parser: bool = False
    max_results: int = 10
    # Explicit default so the field is optional regardless of how the
    # installed pydantic version treats a bare ``Any`` annotation.
    service: Any = None

    def load_data(self) -> List[Document]:
        """Load emails from the user's account.

        Builds the Gmail service on first use (only if none was supplied),
        runs the configured search, and wraps every hit in a ``Document``.

        Returns:
            List[Document]: One document per message that has a non-empty
            body. The body is the document text; ``id``, ``threadId`` and
            ``snippet`` are passed as ``extra_info``.
        """
        if not self.service:
            # Only touch credentials (token file / browser OAuth flow) when a
            # service actually has to be built; an injected service is used
            # as-is.
            from googleapiclient.discovery import build

            credentials = self._get_credentials()
            self.service = build('gmail', 'v1', credentials=credentials)

        results = []
        for message in self.search_messages():
            # Whatever remains after removing the body is the metadata.
            text = message.pop('body')
            results.append(Document(text, extra_info=message))

        return results

    def _get_credentials(self) -> Any:
        """Get valid user credentials from storage.

        The file token.json stores the user's access and refresh tokens, and is
        created automatically when the authorization flow completes for the
        first time. Both ``token.json`` and ``credentials.json`` are resolved
        relative to the current working directory.

        Returns:
            Credentials, the obtained credential.
        """
        import os

        from google.auth.transport.requests import Request
        from google.oauth2.credentials import Credentials
        from google_auth_oauthlib.flow import InstalledAppFlow

        creds = None
        if os.path.exists("token.json"):
            creds = Credentials.from_authorized_user_file("token.json", SCOPES)

        # If there are no (valid) credentials available, let the user log in.
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    "credentials.json", SCOPES
                )
                # Starts a local redirect server on port 8080 for the flow.
                creds = flow.run_local_server(port=8080)
            # Save the credentials for the next run
            with open("token.json", "w") as token:
                token.write(creds.to_json())

        return creds

    def search_messages(self):
        """Run the configured Gmail query and fetch every matching message.

        Returns:
            list: One dict per message as produced by ``get_message_data``;
            messages without a body are skipped.

        Raises:
            Exception: If fetching or parsing any message fails. The original
                error is attached as ``__cause__``.
        """
        messages = (
            self.service.users()
            .messages()
            .list(
                userId='me',
                q=self.query,
                maxResults=int(self.max_results),
            )
            .execute()
            .get('messages', [])
        )

        result = []
        try:
            for message in messages:
                message_data = self.get_message_data(message)
                if message_data:
                    result.append(message_data)
        except Exception as e:
            # Keep the generic exception type callers may already catch, but
            # preserve the cause and make the message readable.
            raise Exception(f"Can't get message data: {e}") from e

        return result

    def get_message_data(self, message):
        """Fetch one message in raw format and extract its body.

        Args:
            message (dict): Search hit containing at least the key ``id``.

        Returns:
            dict with the keys ``id``, ``threadId``, ``snippet`` and ``body``,
            or None when the extracted body is empty.
        """
        message_id = message['id']
        message_data = self.service.users().messages().get(
            format="raw",
            userId='me',
            id=message_id,
        ).execute()

        if self.use_iterative_parser:
            body = self.extract_message_body_iterative(message_data)
        else:
            body = self.extract_message_body(message_data)

        if not body:
            return None

        return {
            "id": message_data['id'],
            "threadId": message_data['threadId'],
            "snippet": message_data['snippet'],
            "body": body,
        }

    def extract_message_body_iterative(self, message: Any):
        """Collect the text of all ``text/plain`` parts of a message.

        Args:
            message: Either the API response dict, whose ``raw`` entry holds
                the base64url-encoded message, or an already parsed
                ``email.message.Message`` (used for the recursive calls on
                MIME sub-parts).

        Returns:
            str: Concatenated plain-text content; empty if there is none.
        """
        # Tell the two input kinds apart by type. Indexing a Message with
        # 'raw' is a header lookup and only "worked" because it returns None.
        if isinstance(message, dict):
            raw_bytes = base64.urlsafe_b64decode(message['raw'].encode('utf8'))
            mime_msg = email.message_from_bytes(raw_bytes)
        else:
            mime_msg = message

        if mime_msg.get_content_type() == 'text/plain':
            payload = mime_msg.get_payload(decode=True)
            if payload is None:
                return ''
            charset = mime_msg.get_content_charset('utf-8')
            try:
                return payload.decode(charset, errors='replace')
            except LookupError:
                # The declared charset is unknown to Python; fall back to
                # UTF-8 rather than failing the whole message.
                return payload.decode('utf-8', errors='replace')

        # is_multipart() guards against malformed multipart messages whose
        # payload is a plain string instead of a list of sub-messages.
        if mime_msg.get_content_maintype() == 'multipart' and mime_msg.is_multipart():
            return ''.join(
                self.extract_message_body_iterative(part)
                for part in mime_msg.get_payload()
            )

        return ''

    def extract_message_body(self, message: dict):
        """Return the raw message as text, stripping tags if it has HTML.

        Args:
            message (dict): API response whose ``raw`` entry holds the
                base64url-encoded message.

        Returns:
            str: The decoded raw message. If any MIME part is ``text/html``,
            the raw message is passed through BeautifulSoup and only its text
            is returned.

        Raises:
            Exception: If decoding or parsing fails. The original error is
                attached as ``__cause__``.
        """
        from bs4 import BeautifulSoup

        try:
            body = base64.urlsafe_b64decode(message['raw'].encode('ASCII'))
            mime_msg = email.message_from_bytes(body)

            # If the message body contains HTML, parse it with BeautifulSoup.
            # ``'text/html' in mime_msg`` would test header *names*, so the
            # content type of every MIME part is inspected instead.
            has_html = any(
                part.get_content_type() == 'text/html'
                for part in mime_msg.walk()
            )
            if has_html:
                # get_text() already returns str; no further decoding needed.
                return BeautifulSoup(body, 'html.parser').get_text()

            # Raw messages may contain non-ASCII bytes; never fail on them.
            return body.decode('utf-8', errors='replace')
        except Exception as e:
            raise Exception(f"Can't parse message body: {e}") from e
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: fetch the messages matching the sample query and
    # print the resulting documents.
    gmail_reader = GmailReader(query="from:me after:2023-01-01")
    documents = gmail_reader.load_data()
    print(documents)
|