mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-16 04:31:28 +00:00
94 lines
3.5 KiB
Python
94 lines
3.5 KiB
Python
"""Confluence reader."""
|
|
import os
|
|
from typing import List, Optional
|
|
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
# Names of the environment variables consulted when credentials are not
# passed to the reader explicitly.
CONFLUENCE_USERNAME = "CONFLUENCE_USERNAME"
CONFLUENCE_API_TOKEN = "CONFLUENCE_API_TOKEN"
|
|
|
|
|
|
class ConfluenceReader(BaseReader):
    """Confluence reader.

    Reads a set of confluence pages given a space id and optionally a list of
    page ids, converting each page's storage-format body to plain text.

    Args:
        user_name (str): your confluence username. When omitted, the
            `CONFLUENCE_USERNAME` environment variable is used.
        api_token (str): api token for your account
            (https://support.atlassian.com/atlassian-account/docs/manage-api-tokens-for-your-atlassian-account/).
            When omitted, the `CONFLUENCE_API_TOKEN` environment variable is used.
        base_url (str): 'base_url' for confluence instance, this is suffixed
            with '/wiki', eg 'https://yoursite.atlassian.com/wiki'

    Raises:
        ImportError: if the `atlassian-python-api` package is not installed.
        ValueError: if the user name or api token cannot be resolved, or if
            `base_url` is missing.
    """

    # Expansion requested from the API so each page carries its body.
    _BODY_EXPAND = "body.storage.value"

    # Number of pages requested per call while walking a space.
    _SPACE_PAGE_BATCH_SIZE = 100

    def __init__(
        self,
        user_name: Optional[str] = None,
        api_token: Optional[str] = None,
        base_url: Optional[str] = None,
    ) -> None:
        try:
            from atlassian import Confluence
        except ImportError as err:
            raise ImportError(
                "`atlassian` package not found, please run `pip install atlassian-python-api`"
            ) from err

        if user_name is None:
            user_name = os.getenv(CONFLUENCE_USERNAME)
            if user_name is None:
                raise ValueError(
                    "Must specify `user_name` or set environment "
                    "variable `CONFLUENCE_USERNAME`."
                )

        if api_token is None:
            api_token = os.getenv(CONFLUENCE_API_TOKEN)
            if api_token is None:
                raise ValueError(
                    "Must specify `api_token` or set environment "
                    "variable `CONFLUENCE_API_TOKEN`."
                )

        # Checked after the credentials so the credential errors keep their
        # original precedence; a missing URL would otherwise only surface as
        # an obscure failure inside the client.
        if not base_url:
            raise ValueError(
                "Must specify `base_url`, eg 'https://yoursite.atlassian.com/wiki'."
            )

        self.confluence = Confluence(
            url=base_url, username=user_name, password=api_token, cloud=True
        )

    def load_data(
        self,
        space_id: Optional[str] = None,
        page_ids: Optional[List[str]] = None,
    ) -> List[Document]:
        """Load data from the confluence instance.

        Args:
            space_id (Optional[str]): confluence space id - all pages from this space will be loaded
            page_ids (Optional[List[str]]): list of page ids from the given confluence site to load

            if both are specified, the union of both sets will be returned:
            a page reachable through both is returned only once.

        Returns:
            List[Document]: List of documents, space pages first, followed by
            the explicitly requested pages in the order given.

        Raises:
            ImportError: if the `html2text` package is not installed.
            ValueError: if neither `space_id` nor `page_ids` is given.
        """
        try:
            import html2text  # type: ignore
        except ImportError as err:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            ) from err

        page_ids = page_ids or []
        if not space_id and not page_ids:
            raise ValueError("Must specify either `space_id` or `page_ids` or both.")

        text_maker = html2text.HTML2Text()
        text_maker.ignore_links = True
        text_maker.ignore_images = True

        docs: List[Document] = []
        # Ids already turned into documents; keeps the result a true union.
        seen_ids = set()

        if space_id:
            for page in self._fetch_space_pages(space_id):
                page_key = str(page["id"])
                if page_key in seen_ids:
                    continue
                seen_ids.add(page_key)
                docs.append(self._page_to_document(page, text_maker))

        for page_id in page_ids:
            if str(page_id) in seen_ids:
                # Already loaded via the space (or listed twice): skip the
                # redundant request and the duplicate document.
                continue
            page = self.confluence.get_page_by_id(
                page_id=page_id, expand=self._BODY_EXPAND
            )
            seen_ids.add(str(page_id))
            seen_ids.add(str(page["id"]))
            docs.append(self._page_to_document(page, text_maker))

        return docs

    def _fetch_space_pages(self, space_id: str) -> list:
        """Return every page of a space, requesting it batch by batch."""
        pages = []
        start = 0
        while True:
            batch = self.confluence.get_all_pages_from_space(
                space=space_id,
                start=start,
                limit=self._SPACE_PAGE_BATCH_SIZE,
                expand=self._BODY_EXPAND,
            )
            # Stop on an empty batch rather than a short one: the server may
            # return fewer pages than requested without being at the end.
            if not batch:
                break
            pages.extend(batch)
            start += len(batch)
        return pages

    @staticmethod
    def _page_to_document(page: dict, text_maker) -> Document:
        """Convert one page payload into a Document with its title attached."""
        return Document(
            text=text_maker.handle(page["body"]["storage"]["value"]),
            doc_id=page["id"],
            extra_info={"title": page["title"]},
        )
|
|
|
|
|
|
if __name__ == "__main__":
    # Running the module directly only instantiates the reader with default
    # arguments (credentials are then looked up in the environment); no pages
    # are loaded.
    reader = ConfluenceReader()
|