2023-03-05 15:06:10 -06:00
|
|
|
"""Zendesk reader."""
|
|
|
|
import json
|
2023-03-06 11:49:42 -06:00
|
|
|
from typing import List
|
|
|
|
|
2023-03-05 15:06:10 -06:00
|
|
|
from llama_index.readers.base import BaseReader
|
|
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
|
|
|
|
|
|
class ZendeskReader(BaseReader):
    """Zendesk reader. Reads help center articles from a Zendesk workspace.

    Args:
        zendesk_subdomain (str): Zendesk subdomain
        locale (str): Locale of articles
    """

    def __init__(self, zendesk_subdomain: str, locale: str = "en-us") -> None:
        """Initialize Zendesk reader."""
        self.zendesk_subdomain = zendesk_subdomain
        self.locale = locale

    def load_data(self) -> List[Document]:
        """Load all help center articles as plain-text documents.

        Every article returned by ``get_all_articles`` has its HTML body
        reduced to text. The article's ``id``, ``title``, ``html_url`` (stored
        under ``"url"``) and ``updated_at`` are attached as ``extra_info``.

        Returns:
            List[Document]: One document per article that has a body.
        """
        # Imported lazily so bs4 is only needed when data is actually loaded.
        from bs4 import BeautifulSoup

        results = []

        for article in self.get_all_articles():
            body = article["body"]
            if body is None:
                # An article without content cannot be parsed; handing None to
                # BeautifulSoup raises and would abort the entire load.
                continue

            text = BeautifulSoup(body, "html.parser").get_text()
            extra_info = {
                "id": article["id"],
                "title": article["title"],
                "url": article["html_url"],
                "updated_at": article["updated_at"],
            }
            results.append(
                Document(
                    text,
                    extra_info=extra_info,
                )
            )

        return results

    def get_all_articles(self) -> List[dict]:
        """Collect the articles from every page of the listing.

        Pages are requested one after another, following the ``next_page``
        URL of each response until a response carries none.

        Returns:
            List[dict]: The article dictionaries of all pages, in page order.
        """
        articles = []
        next_page = None

        while True:
            response = self.get_articles_page(next_page)
            articles.extend(response["articles"])
            next_page = response["next_page"]

            if next_page is None:
                break

        return articles

    def get_articles_page(self, next_page: str = None) -> dict:
        """Fetch a single page of the article listing.

        Args:
            next_page (str): URL of the page to fetch. When None, the first
                page for the configured subdomain and locale is requested.

        Returns:
            dict: ``{"articles": [...], "next_page": ...}``. ``articles`` is
                an empty list and ``next_page`` is None when the response
                does not contain the corresponding key.
        """
        # Imported lazily so requests is only needed when a page is fetched.
        import requests

        if next_page is None:
            url = f"https://{self.zendesk_subdomain}.zendesk.com/api/v2/help_center/{self.locale}/articles?per_page=100"
        else:
            url = next_page

        response = requests.get(url)
        response_json = json.loads(response.text)

        # .get() keeps a response without these keys from raising KeyError;
        # it then simply contributes no articles and ends the pagination.
        next_page = response_json.get("next_page", None)
        articles = response_json.get("articles", [])

        return {"articles": articles, "next_page": next_page}
|