# ragflow/common/data_source/confluence_connector.py
"""Confluence connector"""
import copy
import json
import logging
import time
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Any, cast, Iterator, Callable, Generator
import requests
from typing_extensions import override
from urllib.parse import quote
import bs4
from atlassian.errors import ApiError
from atlassian import Confluence
from requests.exceptions import HTTPError
from common.data_source.config import (
    INDEX_BATCH_SIZE, DocumentSource, CONTINUE_ON_CONNECTOR_FAILURE,
    CONFLUENCE_CONNECTOR_LABELS_TO_SKIP, CONFLUENCE_TIMEZONE_OFFSET, CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE,
    OAUTH_CONFLUENCE_CLOUD_CLIENT_ID, OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET, _DEFAULT_PAGINATION_LIMIT,
    _PROBLEMATIC_EXPANSIONS, _REPLACEMENT_EXPANSIONS, _USER_NOT_FOUND, _COMMENT_EXPANSION_FIELDS,
    _ATTACHMENT_EXPANSION_FIELDS, _PAGE_EXPANSION_FIELDS, ONE_DAY, ONE_HOUR, _RESTRICTIONS_EXPANSION_FIELDS,
    _SLIM_DOC_BATCH_SIZE, CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD,
)
from common.data_source.exceptions import (
ConnectorMissingCredentialError,
ConnectorValidationError,
InsufficientPermissionsError,
UnexpectedValidationError, CredentialExpiredError
)
from common.data_source.html_utils import format_document_soup
from common.data_source.interfaces import (
ConnectorCheckpoint,
CredentialsConnector,
SecondsSinceUnixEpoch,
SlimConnectorWithPermSync, StaticCredentialsProvider, CheckpointedConnector, SlimConnector,
CredentialsProviderInterface, ConfluenceUser, IndexingHeartbeatInterface, AttachmentProcessingResult,
CheckpointOutput
)
from common.data_source.models import (
    ConnectorFailure, Document, TextSection, ImageSection, BasicExpertInfo,
    DocumentFailure, GenerateSlimDocumentOutput, SlimDocument, ExternalAccess,
)
from common.data_source.utils import (
    load_all_docs_from_checkpoint_connector, scoped_url,
    process_confluence_user_profiles_override, confluence_refresh_tokens, run_with_timeout, _handle_http_error,
    update_param_in_path, get_start_param_from_url, build_confluence_document_id, datetime_from_string,
    is_atlassian_date_error, validate_attachment_filetype,
)
from rag.utils.redis_conn import RedisDB, REDIS_CONN
_USER_ID_TO_DISPLAY_NAME_CACHE: dict[str, str | None] = {}
_USER_EMAIL_CACHE: dict[str, str | None] = {}
class ConfluenceCheckpoint(ConnectorCheckpoint):
next_page_url: str | None
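    # Illustrative serialized form (an assumption based on the fields above and on
    # validate_checkpoint_json below): '{"has_more": true, "next_page_url": null}'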
class ConfluenceRateLimitError(Exception):
pass
class OnyxConfluence:
"""
This is a custom Confluence class that:
A. overrides the default Confluence class to add a custom CQL method.
B.
This is necessary because the default Confluence class does not properly support cql expansions.
All methods are automatically wrapped with handle_confluence_rate_limit.
"""
CREDENTIAL_PREFIX = "connector:confluence:credential"
CREDENTIAL_TTL = 300 # 5 min
PROBE_TIMEOUT = 5 # 5 seconds
def __init__(
self,
is_cloud: bool,
url: str,
credentials_provider: CredentialsProviderInterface,
timeout: int | None = None,
scoped_token: bool = False,
# should generally not be passed in, but making it overridable for
# easier testing
confluence_user_profiles_override: list[dict[str, str]] | None = (
CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE
),
) -> None:
        self.base_url = url
url = scoped_url(url, "confluence") if scoped_token else url
self._is_cloud = is_cloud
self._url = url.rstrip("/")
self._credentials_provider = credentials_provider
self.scoped_token = scoped_token
self.redis_client: RedisDB | None = None
self.static_credentials: dict[str, Any] | None = None
if self._credentials_provider.is_dynamic():
self.redis_client = REDIS_CONN
else:
self.static_credentials = self._credentials_provider.get_credentials()
self._confluence = Confluence(url)
self.credential_key: str = (
self.CREDENTIAL_PREFIX
+ f":credential_{self._credentials_provider.get_provider_key()}"
)
self._kwargs: Any = None
self.shared_base_kwargs: dict[str, str | int | bool] = {
"api_version": "cloud" if is_cloud else "latest",
"backoff_and_retry": True,
"cloud": is_cloud,
}
if timeout:
self.shared_base_kwargs["timeout"] = timeout
self._confluence_user_profiles_override = (
process_confluence_user_profiles_override(confluence_user_profiles_override)
if confluence_user_profiles_override
else None
)
def _renew_credentials(self) -> tuple[dict[str, Any], bool]:
"""credential_json - the current json credentials
Returns a tuple
1. The up to date credentials
2. True if the credentials were updated
This method is intended to be used within a distributed lock.
Lock, call this, update credentials if the tokens were refreshed, then release
"""
# static credentials are preloaded, so no locking/redis required
if self.static_credentials:
return self.static_credentials, False
if not self.redis_client:
raise RuntimeError("self.redis_client is None")
# dynamic credentials need locking
# check redis first, then fallback to the DB
credential_raw = self.redis_client.get(self.credential_key)
if credential_raw is not None:
credential_bytes = cast(bytes, credential_raw)
credential_str = credential_bytes.decode("utf-8")
credential_json: dict[str, Any] = json.loads(credential_str)
else:
credential_json = self._credentials_provider.get_credentials()
if "confluence_refresh_token" not in credential_json:
# static credentials ... cache them permanently and return
self.static_credentials = credential_json
return credential_json, False
if not OAUTH_CONFLUENCE_CLOUD_CLIENT_ID:
raise RuntimeError("OAUTH_CONFLUENCE_CLOUD_CLIENT_ID must be set!")
if not OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET:
raise RuntimeError("OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET must be set!")
# check if we should refresh tokens. we're deciding to refresh halfway
# to expiration
now = datetime.now(timezone.utc)
created_at = datetime.fromisoformat(credential_json["created_at"])
expires_in: int = credential_json["expires_in"]
renew_at = created_at + timedelta(seconds=expires_in // 2)
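        # e.g. with expires_in=3600 we refresh once more than 30 minutes have
        # passed since created_at, well before the token actually expires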
if now <= renew_at:
# cached/current credentials are reasonably up to date
return credential_json, False
# we need to refresh
logging.info("Renewing Confluence Cloud credentials...")
new_credentials = confluence_refresh_tokens(
OAUTH_CONFLUENCE_CLOUD_CLIENT_ID,
OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET,
credential_json["cloud_id"],
credential_json["confluence_refresh_token"],
)
# store the new credentials to redis and to the db thru the provider
# redis: we use a 5 min TTL because we are given a 10 minute grace period
# when keys are rotated. it's easier to expire the cached credentials
# reasonably frequently rather than trying to handle strong synchronization
# between the db and redis everywhere the credentials might be updated
new_credential_str = json.dumps(new_credentials)
self.redis_client.set(
self.credential_key, new_credential_str, nx=True, ex=self.CREDENTIAL_TTL
)
self._credentials_provider.set_credentials(new_credentials)
return new_credentials, True
@staticmethod
def _make_oauth2_dict(credentials: dict[str, Any]) -> dict[str, Any]:
oauth2_dict: dict[str, Any] = {}
if "confluence_refresh_token" in credentials:
oauth2_dict["client_id"] = OAUTH_CONFLUENCE_CLOUD_CLIENT_ID
oauth2_dict["token"] = {}
oauth2_dict["token"]["access_token"] = credentials[
"confluence_access_token"
]
return oauth2_dict
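    # Shape of the returned dict when a refresh token is present (illustrative):
    #   {"client_id": "<OAUTH_CONFLUENCE_CLOUD_CLIENT_ID>", "token": {"access_token": "<...>"}}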
def _probe_connection(
self,
**kwargs: Any,
) -> None:
merged_kwargs = {**self.shared_base_kwargs, **kwargs}
# add special timeout to make sure that we don't hang indefinitely
merged_kwargs["timeout"] = self.PROBE_TIMEOUT
with self._credentials_provider:
credentials, _ = self._renew_credentials()
if self.scoped_token:
# v2 endpoint doesn't always work with scoped tokens, use v1
token = credentials["confluence_access_token"]
probe_url = f"{self.base_url}/rest/api/space?limit=1"
                logging.info(f"First and last 5 chars of token: {token[:5]}...{token[-5:]}")
try:
r = requests.get(
probe_url,
headers={"Authorization": f"Bearer {token}"},
timeout=10,
)
r.raise_for_status()
except HTTPError as e:
if e.response.status_code == 403:
logging.warning(
"scoped token authenticated but not valid for probe endpoint (spaces)"
)
else:
if "WWW-Authenticate" in e.response.headers:
logging.warning(
f"WWW-Authenticate: {e.response.headers['WWW-Authenticate']}"
)
logging.warning(f"Full error: {e.response.text}")
raise e
return
# probe connection with direct client, no retries
if "confluence_refresh_token" in credentials:
logging.info("Probing Confluence with OAuth Access Token.")
oauth2_dict: dict[str, Any] = OnyxConfluence._make_oauth2_dict(
credentials
)
url = (
f"https://api.atlassian.com/ex/confluence/{credentials['cloud_id']}"
)
confluence_client_with_minimal_retries = Confluence(
url=url, oauth2=oauth2_dict, **merged_kwargs
)
else:
logging.info("Probing Confluence with Personal Access Token.")
url = self._url
if self._is_cloud:
logging.info("running with cloud client")
confluence_client_with_minimal_retries = Confluence(
url=url,
username=credentials["confluence_username"],
password=credentials["confluence_access_token"],
**merged_kwargs,
)
else:
confluence_client_with_minimal_retries = Confluence(
url=url,
token=credentials["confluence_access_token"],
**merged_kwargs,
)
# This call sometimes hangs indefinitely, so we run it in a timeout
spaces = run_with_timeout(
timeout=10,
func=confluence_client_with_minimal_retries.get_all_spaces,
limit=1,
)
# uncomment the following for testing
# the following is an attempt to retrieve the user's timezone
            # Unfortunately, all data is returned in UTC regardless of the user's time zone,
            # even though CQL parses incoming times based on the user's time zone
# space_key = spaces["results"][0]["key"]
# space_details = confluence_client_with_minimal_retries.cql(f"space.key={space_key}+AND+type=space")
if not spaces:
raise RuntimeError(
f"No spaces found at {url}! "
"Check your credentials and wiki_base and make sure "
"is_cloud is set correctly."
)
logging.info("Confluence probe succeeded.")
def _initialize_connection(
self,
**kwargs: Any,
) -> None:
"""Called externally to init the connection in a thread safe manner."""
merged_kwargs = {**self.shared_base_kwargs, **kwargs}
with self._credentials_provider:
credentials, _ = self._renew_credentials()
self._confluence = self._initialize_connection_helper(
credentials, **merged_kwargs
)
self._kwargs = merged_kwargs
def _initialize_connection_helper(
self,
credentials: dict[str, Any],
**kwargs: Any,
) -> Confluence:
"""Called internally to init the connection. Distributed locking
to prevent multiple threads from modifying the credentials
must be handled around this function."""
confluence = None
# probe connection with direct client, no retries
if "confluence_refresh_token" in credentials:
logging.info("Connecting to Confluence Cloud with OAuth Access Token.")
oauth2_dict: dict[str, Any] = OnyxConfluence._make_oauth2_dict(credentials)
url = f"https://api.atlassian.com/ex/confluence/{credentials['cloud_id']}"
confluence = Confluence(url=url, oauth2=oauth2_dict, **kwargs)
else:
logging.info(
f"Connecting to Confluence with Personal Access Token as user: {credentials['confluence_username']}"
)
if self._is_cloud:
confluence = Confluence(
url=self._url,
username=credentials["confluence_username"],
password=credentials["confluence_access_token"],
**kwargs,
)
else:
confluence = Confluence(
url=self._url,
token=credentials["confluence_access_token"],
**kwargs,
)
return confluence
# https://developer.atlassian.com/cloud/confluence/rate-limiting/
# This uses the native rate limiting option provided by the
# confluence client and otherwise applies a simpler set of error handling.
def _make_rate_limited_confluence_method(
self, name: str, credential_provider: CredentialsProviderInterface | None
) -> Callable[..., Any]:
def wrapped_call(*args: list[Any], **kwargs: Any) -> Any:
MAX_RETRIES = 5
TIMEOUT = 600
timeout_at = time.monotonic() + TIMEOUT
for attempt in range(MAX_RETRIES):
if time.monotonic() > timeout_at:
raise TimeoutError(
f"Confluence call attempts took longer than {TIMEOUT} seconds."
)
# we're relying more on the client to rate limit itself
# and applying our own retries in a more specific set of circumstances
try:
if credential_provider:
with credential_provider:
credentials, renewed = self._renew_credentials()
if renewed:
self._confluence = self._initialize_connection_helper(
credentials, **self._kwargs
)
attr = getattr(self._confluence, name, None)
if attr is None:
# The underlying Confluence client doesn't have this attribute
raise AttributeError(
f"'{type(self).__name__}' object has no attribute '{name}'"
)
return attr(*args, **kwargs)
else:
attr = getattr(self._confluence, name, None)
if attr is None:
# The underlying Confluence client doesn't have this attribute
raise AttributeError(
f"'{type(self).__name__}' object has no attribute '{name}'"
)
return attr(*args, **kwargs)
except HTTPError as e:
                    delay_until = _handle_http_error(e, attempt)
                    logging.warning(
                        "HTTPError in confluence call. "
                        f"Retrying in {max(0.0, delay_until - time.monotonic()):.1f} seconds..."
                    )
while time.monotonic() < delay_until:
# in the future, check a signal here to exit
time.sleep(1)
except AttributeError as e:
# Some error within the Confluence library, unclear why it fails.
# Users reported it to be intermittent, so just retry
if attempt == MAX_RETRIES - 1:
raise e
logging.exception(
"Confluence Client raised an AttributeError. Retrying..."
)
time.sleep(5)
return wrapped_call
def __getattr__(self, name: str) -> Any:
"""Dynamically intercept attribute/method access."""
attr = getattr(self._confluence, name, None)
if attr is None:
# The underlying Confluence client doesn't have this attribute
raise AttributeError(
f"'{type(self).__name__}' object has no attribute '{name}'"
)
# If it's not a method, just return it after ensuring token validity
if not callable(attr):
return attr
# skip methods that start with "_"
if name.startswith("_"):
return attr
# wrap the method with our retry handler
rate_limited_method: Callable[..., Any] = (
self._make_rate_limited_confluence_method(name, self._credentials_provider)
)
return rate_limited_method
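    # Example (illustrative): `client.get_all_spaces(limit=1)` resolves through
    # __getattr__ above, so the underlying client call is transparently wrapped
    # with credential renewal plus HTTP retry handling.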
def _try_one_by_one_for_paginated_url(
self,
url_suffix: str,
initial_start: int,
limit: int,
) -> Generator[dict[str, Any], None, str | None]:
"""
Go through `limit` items, starting at `initial_start` one by one (e.g. using
`limit=1` for each call).
If we encounter an error, we skip the item and try the next one. We will return
the items we were able to retrieve successfully.
Returns the expected next url_suffix. Returns None if it thinks we've hit the end.
TODO (chris): make this yield failures as well as successes.
TODO (chris): make this work for confluence cloud somehow.
"""
if self._is_cloud:
raise RuntimeError("This method is not implemented for Confluence Cloud.")
found_empty_page = False
temp_url_suffix = url_suffix
for ind in range(limit):
try:
temp_url_suffix = update_param_in_path(
url_suffix, "start", str(initial_start + ind)
)
temp_url_suffix = update_param_in_path(temp_url_suffix, "limit", "1")
logging.info(f"Making recovery confluence call to {temp_url_suffix}")
raw_response = self.get(path=temp_url_suffix, advanced_mode=True)
raw_response.raise_for_status()
latest_results = raw_response.json().get("results", [])
yield from latest_results
if not latest_results:
# no more results, break out of the loop
                    logging.info(
                        f"No results found for call '{temp_url_suffix}'. "
                        "Stopping pagination."
                    )
found_empty_page = True
break
except Exception:
logging.exception(
f"Error in confluence call to {temp_url_suffix}. Continuing."
)
if found_empty_page:
return None
# if we got here, we successfully tried `limit` items
return update_param_in_path(url_suffix, "start", str(initial_start + limit))
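    # e.g. a failed call to "...&start=40&limit=20" is recovered as twenty calls
    # "...&start=40&limit=1", "...&start=41&limit=1", ..., skipping items that error.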
def _paginate_url(
self,
url_suffix: str,
limit: int | None = None,
# Called with the next url to use to get the next page
next_page_callback: Callable[[str], None] | None = None,
force_offset_pagination: bool = False,
) -> Iterator[dict[str, Any]]:
"""
This will paginate through the top level query.
"""
if not limit:
limit = _DEFAULT_PAGINATION_LIMIT
url_suffix = update_param_in_path(url_suffix, "limit", str(limit))
while url_suffix:
logging.debug(f"Making confluence call to {url_suffix}")
try:
raw_response = self.get(
path=url_suffix,
advanced_mode=True,
params={
"body-format": "atlas_doc_format",
"expand": "body.atlas_doc_format",
},
)
except Exception as e:
logging.exception(f"Error in confluence call to {url_suffix}")
raise e
try:
raw_response.raise_for_status()
except Exception as e:
logging.warning(f"Error in confluence call to {url_suffix}")
# If the problematic expansion is in the url, replace it
# with the replacement expansion and try again
# If that fails, raise the error
if _PROBLEMATIC_EXPANSIONS in url_suffix:
logging.warning(
f"Replacing {_PROBLEMATIC_EXPANSIONS} with {_REPLACEMENT_EXPANSIONS}"
" and trying again."
)
url_suffix = url_suffix.replace(
_PROBLEMATIC_EXPANSIONS,
_REPLACEMENT_EXPANSIONS,
)
continue
# If we fail due to a 500, try one by one.
# NOTE: this iterative approach only works for server, since cloud uses cursor-based
# pagination
if raw_response.status_code == 500 and not self._is_cloud:
initial_start = get_start_param_from_url(url_suffix)
if initial_start is None:
# can't handle this if we don't have offset-based pagination
raise
# this will just yield the successful items from the batch
new_url_suffix = yield from self._try_one_by_one_for_paginated_url(
url_suffix,
initial_start=initial_start,
limit=limit,
)
# this means we ran into an empty page
if new_url_suffix is None:
if next_page_callback:
next_page_callback("")
break
url_suffix = new_url_suffix
continue
else:
logging.exception(
f"Error in confluence call to {url_suffix} \n"
f"Raw Response Text: {raw_response.text} \n"
f"Full Response: {raw_response.__dict__} \n"
f"Error: {e} \n"
)
raise
try:
next_response = raw_response.json()
except Exception as e:
logging.exception(
f"Failed to parse response as JSON. Response: {raw_response.__dict__}"
)
raise e
# Yield the results individually.
results = cast(list[dict[str, Any]], next_response.get("results", []))
# Note 1:
# Make sure we don't update the start by more than the amount
# of results we were able to retrieve. The Confluence API has a
# weird behavior where if you pass in a limit that is too large for
# the configured server, it will artificially limit the amount of
# results returned BUT will not apply this to the start parameter.
# This will cause us to miss results.
#
# Note 2:
# We specifically perform manual yielding (i.e., `for x in xs: yield x`) as opposed to using a `yield from xs`
# because we *have to call the `next_page_callback`* prior to yielding the last element!
#
# If we did:
#
# ```py
# yield from results
# if next_page_callback:
# next_page_callback(url_suffix)
# ```
#
# then the logic would fail since the iterator would finish (and the calling scope would exit out of its driving
# loop) prior to the callback being called.
old_url_suffix = url_suffix
updated_start = get_start_param_from_url(old_url_suffix)
url_suffix = cast(str, next_response.get("_links", {}).get("next", ""))
for i, result in enumerate(results):
updated_start += 1
if url_suffix and next_page_callback and i == len(results) - 1:
# update the url if we're on the last result in the page
if not self._is_cloud:
# If confluence claims there are more results, we update the start param
# based on how many results were returned and try again.
url_suffix = update_param_in_path(
url_suffix, "start", str(updated_start)
)
# notify the caller of the new url
next_page_callback(url_suffix)
elif force_offset_pagination and i == len(results) - 1:
url_suffix = update_param_in_path(
old_url_suffix, "start", str(updated_start)
)
yield result
# we've observed that Confluence sometimes returns a next link despite giving
# 0 results. This is a bug with Confluence, so we need to check for it and
# stop paginating.
if url_suffix and not results:
logging.info(
f"No results found for call '{old_url_suffix}' despite next link "
"being present. Stopping pagination."
)
break
def build_cql_url(self, cql: str, expand: str | None = None) -> str:
expand_string = f"&expand={expand}" if expand else ""
return f"rest/api/content/search?cql={cql}{expand_string}"
def paginated_cql_retrieval(
self,
cql: str,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
The content/search endpoint can be used to fetch pages, attachments, and comments.
"""
cql_url = self.build_cql_url(cql, expand)
yield from self._paginate_url(cql_url, limit)
def paginated_page_retrieval(
self,
cql_url: str,
limit: int,
# Called with the next url to use to get the next page
next_page_callback: Callable[[str], None] | None = None,
) -> Iterator[dict[str, Any]]:
"""
Error handling (and testing) wrapper for _paginate_url,
because the current approach to page retrieval involves handling the
next page links manually.
"""
try:
yield from self._paginate_url(
cql_url, limit=limit, next_page_callback=next_page_callback
)
except Exception as e:
logging.exception(f"Error in paginated_page_retrieval: {e}")
raise e
def cql_paginate_all_expansions(
self,
cql: str,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
This function will paginate through the top level query first, then
paginate through all of the expansions.
"""
def _traverse_and_update(data: dict | list) -> None:
if isinstance(data, dict):
next_url = data.get("_links", {}).get("next")
if next_url and "results" in data:
data["results"].extend(self._paginate_url(next_url, limit=limit))
for value in data.values():
_traverse_and_update(value)
elif isinstance(data, list):
for item in data:
_traverse_and_update(item)
for confluence_object in self.paginated_cql_retrieval(cql, expand, limit):
_traverse_and_update(confluence_object)
yield confluence_object
def paginated_cql_user_retrieval(
self,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[ConfluenceUser]:
"""
The search/user endpoint can be used to fetch users.
It's a separate endpoint from the content/search endpoint used only for users.
Otherwise it's very similar to the content/search endpoint.
"""
# this is needed since there is a live bug with Confluence Server/Data Center
# where not all users are returned by the APIs. This is a workaround needed until
# that is patched.
if self._confluence_user_profiles_override:
yield from self._confluence_user_profiles_override
elif self._is_cloud:
cql = "type=user"
url = "rest/api/search/user"
expand_string = f"&expand={expand}" if expand else ""
url += f"?cql={cql}{expand_string}"
for user_result in self._paginate_url(
url, limit, force_offset_pagination=True
):
user = user_result["user"]
yield ConfluenceUser(
user_id=user["accountId"],
username=None,
display_name=user["displayName"],
email=user.get("email"),
type=user["accountType"],
)
else:
for user in self._paginate_url("rest/api/user/list", limit):
yield ConfluenceUser(
user_id=user["userKey"],
username=user["username"],
display_name=user["displayName"],
email=None,
type=user.get("type", "user"),
)
def paginated_groups_by_user_retrieval(
self,
user_id: str, # accountId in Cloud, userKey in Server
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
        This is not a CQL query; it's a Confluence-specific endpoint that can
        be used to fetch the groups a user belongs to.
"""
user_field = "accountId" if self._is_cloud else "key"
user_value = user_id
# Server uses userKey (but calls it key during the API call), Cloud uses accountId
user_query = f"{user_field}={quote(user_value)}"
url = f"rest/api/user/memberof?{user_query}"
yield from self._paginate_url(url, limit, force_offset_pagination=True)
def paginated_groups_retrieval(
self,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
        This is not a CQL query; it's a Confluence-specific endpoint that can
        be used to fetch groups.
"""
yield from self._paginate_url("rest/api/group", limit)
def paginated_group_members_retrieval(
self,
group_name: str,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
        This is not a CQL query; it's a Confluence-specific endpoint that can
        be used to fetch the members of a group.
        THIS DOESN'T WORK FOR SERVER because it breaks when there is a slash in
        the group name. E.g. neither "test/group" nor "test%2Fgroup" works for Confluence.
"""
group_name = quote(group_name)
yield from self._paginate_url(f"rest/api/group/{group_name}/member", limit)
def get_all_space_permissions_server(
self,
space_key: str,
) -> list[dict[str, Any]]:
"""
This is a confluence server specific method that can be used to
fetch the permissions of a space.
        This provides better logging than calling the get_space_permissions
        method because it returns a jsonrpc response that can be inspected.
TODO: Make this call these endpoints for newer confluence versions:
- /rest/api/space/{spaceKey}/permissions
- /rest/api/space/{spaceKey}/permissions/anonymous
"""
url = "rpc/json-rpc/confluenceservice-v2"
data = {
"jsonrpc": "2.0",
"method": "getSpacePermissionSets",
"id": 7,
"params": [space_key],
}
response = self.post(url, data=data)
logging.debug(f"jsonrpc response: {response}")
if not response.get("result"):
logging.warning(
f"No jsonrpc response for space permissions for space {space_key}"
f"\nResponse: {response}"
)
return response.get("result", [])
def get_current_user(self, expand: str | None = None) -> Any:
"""
Implements a method that isn't in the third party client.
Get information about the current user
:param expand: OPTIONAL expand for get status of user.
Possible param is "status". Results are "Active, Deactivated"
:return: Returns the user details
"""
from atlassian.errors import ApiPermissionError # type:ignore
url = "rest/api/user/current"
params = {}
if expand:
params["expand"] = expand
try:
response = self.get(url, params=params)
except HTTPError as e:
if e.response.status_code == 403:
raise ApiPermissionError(
"The calling user does not have permission", reason=e
)
raise
return response
def get_user_email_from_username__server(
confluence_client: OnyxConfluence, user_name: str
) -> str | None:
global _USER_EMAIL_CACHE
if _USER_EMAIL_CACHE.get(user_name) is None:
try:
response = confluence_client.get_mobile_parameters(user_name)
email = response.get("email")
except Exception:
logging.warning(f"failed to get confluence email for {user_name}")
# For now, we'll just return None and log a warning. This means
# we will keep retrying to get the email every group sync.
email = None
# We may want to just return a string that indicates failure so we dont
# keep retrying
# email = f"FAILED TO GET CONFLUENCE EMAIL FOR {user_name}"
_USER_EMAIL_CACHE[user_name] = email
return _USER_EMAIL_CACHE[user_name]
def _get_user(confluence_client: OnyxConfluence, user_id: str) -> str:
"""Get Confluence Display Name based on the account-id or userkey value
Args:
user_id (str): The user id (i.e: the account-id or userkey)
confluence_client (Confluence): The Confluence Client
Returns:
str: The User Display Name. 'Unknown User' if the user is deactivated or not found
"""
global _USER_ID_TO_DISPLAY_NAME_CACHE
if _USER_ID_TO_DISPLAY_NAME_CACHE.get(user_id) is None:
try:
result = confluence_client.get_user_details_by_userkey(user_id)
found_display_name = result.get("displayName")
except Exception:
found_display_name = None
if not found_display_name:
try:
result = confluence_client.get_user_details_by_accountid(user_id)
found_display_name = result.get("displayName")
except Exception:
found_display_name = None
_USER_ID_TO_DISPLAY_NAME_CACHE[user_id] = found_display_name
return _USER_ID_TO_DISPLAY_NAME_CACHE.get(user_id) or _USER_NOT_FOUND
def sanitize_attachment_title(title: str) -> str:
"""
Sanitize the attachment title to be a valid HTML attribute.
"""
return title.replace("<", "_").replace(">", "_").replace(" ", "_").replace(":", "_")
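# e.g. sanitize_attachment_title("My file: v2.pdf") -> "My_file__v2.pdf"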
def extract_text_from_confluence_html(
confluence_client: OnyxConfluence,
confluence_object: dict[str, Any],
fetched_titles: set[str],
) -> str:
"""Parse a Confluence html page and replace the 'user Id' by the real
User Display Name
Args:
confluence_object (dict): The confluence object as a dict
confluence_client (Confluence): Confluence client
fetched_titles (set[str]): The titles of the pages that have already been fetched
Returns:
        str: loaded and formatted Confluence page
"""
body = confluence_object["body"]
object_html = body.get("storage", body.get("view", {})).get("value")
soup = bs4.BeautifulSoup(object_html, "html.parser")
_remove_macro_stylings(soup=soup)
for user in soup.findAll("ri:user"):
user_id = (
user.attrs["ri:account-id"]
if "ri:account-id" in user.attrs
else user.get("ri:userkey")
)
if not user_id:
logging.warning(
"ri:userkey not found in ri:user element. " f"Found attrs: {user.attrs}"
)
continue
# Include @ sign for tagging, more clear for LLM
user.replaceWith("@" + _get_user(confluence_client, user_id))
for html_page_reference in soup.findAll("ac:structured-macro"):
# Here, we only want to process page within page macros
if html_page_reference.attrs.get("ac:name") != "include":
continue
page_data = html_page_reference.find("ri:page")
if not page_data:
logging.warning(
f"Skipping retrieval of {html_page_reference} because because page data is missing"
)
continue
page_title = page_data.attrs.get("ri:content-title")
if not page_title:
# only fetch pages that have a title
logging.warning(
f"Skipping retrieval of {html_page_reference} because it has no title"
)
continue
if page_title in fetched_titles:
# prevent recursive fetching of pages
logging.debug(f"Skipping {page_title} because it has already been fetched")
continue
fetched_titles.add(page_title)
# Wrap this in a try-except because there are some pages that might not exist
try:
page_query = f"type=page and title='{quote(page_title)}'"
page_contents: dict[str, Any] | None = None
# Confluence enforces title uniqueness, so we should only get one result here
for page in confluence_client.paginated_cql_retrieval(
cql=page_query,
expand="body.storage.value",
limit=1,
):
page_contents = page
break
except Exception as e:
logging.warning(
f"Error getting page contents for object {confluence_object}: {e}"
)
continue
if not page_contents:
continue
text_from_page = extract_text_from_confluence_html(
confluence_client=confluence_client,
confluence_object=page_contents,
fetched_titles=fetched_titles,
)
html_page_reference.replaceWith(text_from_page)
for html_link_body in soup.findAll("ac:link-body"):
# This extracts the text from inline links in the page so they can be
# represented in the document text as plain text
try:
text_from_link = html_link_body.text
html_link_body.replaceWith(f"(LINK TEXT: {text_from_link})")
except Exception as e:
logging.warning(f"Error processing ac:link-body: {e}")
for html_attachment in soup.findAll("ri:attachment"):
# This extracts the text from inline attachments in the page so they can be
# represented in the document text as plain text
try:
html_attachment.replaceWith(
f"<attachment>{sanitize_attachment_title(html_attachment.attrs['ri:filename'])}</attachment>"
) # to be replaced later
except Exception as e:
logging.warning(f"Error processing ac:attachment: {e}")
return format_document_soup(soup)
def _remove_macro_stylings(soup: bs4.BeautifulSoup) -> None:
for macro_root in soup.findAll("ac:structured-macro"):
if not isinstance(macro_root, bs4.Tag):
continue
macro_styling = macro_root.find(name="ac:parameter", attrs={"ac:name": "page"})
if not macro_styling or not isinstance(macro_styling, bs4.Tag):
continue
macro_styling.extract()
def get_page_restrictions(
confluence_client: OnyxConfluence,
page_id: str,
page_restrictions: dict[str, Any],
ancestors: list[dict[str, Any]],
) -> ExternalAccess | None:
"""
Get page access restrictions for a Confluence page.
This functionality requires Enterprise Edition.
Args:
confluence_client: OnyxConfluence client instance
page_id: The ID of the page
page_restrictions: Dictionary containing page restriction data
ancestors: List of ancestor pages with their restriction data
Returns:
ExternalAccess object for the page. None if EE is not enabled or no restrictions found.
"""
# Fetch the EE implementation
"""
ee_get_all_page_restrictions = cast(
Callable[
[OnyxConfluence, str, dict[str, Any], list[dict[str, Any]]],
ExternalAccess | None,
],
fetch_versioned_implementation(
"onyx.external_permissions.confluence.page_access", "get_page_restrictions"
),
)
return ee_get_all_page_restrictions(
confluence_client, page_id, page_restrictions, ancestors
)"""
    # The EE implementation above is disabled; without it there are no page-level restrictions.
    return None
def get_all_space_permissions(
confluence_client: OnyxConfluence,
is_cloud: bool,
) -> dict[str, ExternalAccess]:
"""
Get access permissions for all spaces in Confluence.
This functionality requires Enterprise Edition.
Args:
confluence_client: OnyxConfluence client instance
is_cloud: Whether this is a Confluence Cloud instance
Returns:
Dictionary mapping space keys to ExternalAccess objects. Empty dict if EE is not enabled.
"""
"""
# Fetch the EE implementation
ee_get_all_space_permissions = cast(
Callable[
[OnyxConfluence, bool],
dict[str, ExternalAccess],
],
fetch_versioned_implementation(
"onyx.external_permissions.confluence.space_access",
"get_all_space_permissions",
),
)
return ee_get_all_space_permissions(confluence_client, is_cloud)"""
return {}
def _make_attachment_link(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
parent_content_id: str | None = None,
) -> str | None:
download_link = ""
if "api.atlassian.com" in confluence_client.url:
# https://developer.atlassian.com/cloud/confluence/rest/v1/api-group-content---attachments/#api-wiki-rest-api-content-id-child-attachment-attachmentid-download-get
if not parent_content_id:
logging.warning(
"parent_content_id is required to download attachments from Confluence Cloud!"
)
return None
download_link = (
confluence_client.url
+ f"/rest/api/content/{parent_content_id}/child/attachment/{attachment['id']}/download"
)
else:
download_link = confluence_client.url + attachment["_links"]["download"]
return download_link
def _process_image_attachment(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
raw_bytes: bytes,
media_type: str,
) -> AttachmentProcessingResult:
"""Process an image attachment by saving it without generating a summary."""
return AttachmentProcessingResult(text="", file_blob=raw_bytes, file_name=attachment.get("title", "unknown_title"), error=None)
def process_attachment(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
parent_content_id: str | None,
allow_images: bool,
) -> AttachmentProcessingResult:
"""
Processes a Confluence attachment. If it's a document, extracts text,
or if it's an image, stores it for later analysis. Returns a structured result.
"""
try:
# Get the media type from the attachment metadata
media_type: str = attachment.get("metadata", {}).get("mediaType", "")
# Validate the attachment type
if not validate_attachment_filetype(attachment):
return AttachmentProcessingResult(
text=None,
file_blob=None,
file_name=None,
error=f"Unsupported file type: {media_type}",
)
attachment_link = _make_attachment_link(
confluence_client, attachment, parent_content_id
)
if not attachment_link:
return AttachmentProcessingResult(
text=None, file_blob=None, file_name=None, error="Failed to make attachment link"
)
attachment_size = attachment["extensions"]["fileSize"]
if media_type.startswith("image/"):
if not allow_images:
return AttachmentProcessingResult(
text=None,
file_blob=None,
file_name=None,
error="Image downloading is not enabled",
)
else:
if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD:
logging.warning(
f"Skipping {attachment_link} due to size. "
f"size={attachment_size} "
f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD}"
)
return AttachmentProcessingResult(
text=None,
file_blob=None,
file_name=None,
error=f"Attachment text too long: {attachment_size} chars",
)
logging.info(
f"Downloading attachment: "
f"title={attachment['title']} "
f"length={attachment_size} "
f"link={attachment_link}"
)
# Download the attachment
resp: requests.Response = confluence_client._session.get(attachment_link)
if resp.status_code != 200:
logging.warning(
f"Failed to fetch {attachment_link} with status code {resp.status_code}"
)
return AttachmentProcessingResult(
text=None,
file_blob=None,
file_name=None,
error=f"Attachment download status code is {resp.status_code}",
)
raw_bytes = resp.content
if not raw_bytes:
            return AttachmentProcessingResult(
                text=None, file_blob=None, file_name=None, error="Downloaded attachment content is empty"
            )
# Process image attachments
if media_type.startswith("image/"):
return _process_image_attachment(
confluence_client, attachment, raw_bytes, media_type
)
        # Process document attachments
        try:
            return AttachmentProcessingResult(
                text="", file_blob=raw_bytes, file_name=attachment.get("title", "unknown_title"), error=None
            )
        except Exception as e:
            logging.exception(e)
            return AttachmentProcessingResult(
                text=None, file_blob=None, file_name=None, error=f"Failed to extract text: {e}"
            )
except Exception as e:
return AttachmentProcessingResult(
text=None, file_blob=None, file_name=None, error=f"Failed to process attachment: {e}"
)
def convert_attachment_to_content(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
page_id: str,
allow_images: bool,
) -> tuple[str | None, bytes | bytearray | None] | None:
"""
Facade function which:
1. Validates attachment type
2. Extracts content or stores image for later processing
3. Returns (content_text, stored_file_name) or None if we should skip it
"""
media_type = attachment.get("metadata", {}).get("mediaType", "")
# Quick check for unsupported types:
if media_type.startswith("video/") or media_type == "application/gliffy+json":
logging.warning(
f"Skipping unsupported attachment type: '{media_type}' for {attachment['title']}"
)
return None
result = process_attachment(confluence_client, attachment, page_id, allow_images)
if result.error is not None:
logging.warning(
f"Attachment {attachment['title']} encountered error: {result.error}"
)
return None
return result.file_name, result.file_blob
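# Typical successful return value (illustrative): ("report.pdf", b"%PDF-...")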
class ConfluenceConnector(
CheckpointedConnector[ConfluenceCheckpoint],
SlimConnector,
SlimConnectorWithPermSync,
CredentialsConnector,
):
def __init__(
self,
wiki_base: str,
is_cloud: bool,
space: str = "",
page_id: str = "",
index_recursively: bool = False,
cql_query: str | None = None,
batch_size: int = INDEX_BATCH_SIZE,
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
# if a page has one of the labels specified in this list, we will just
# skip it. This is generally used to avoid indexing extra sensitive
# pages.
labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
timezone_offset: float = CONFLUENCE_TIMEZONE_OFFSET,
scoped_token: bool = False,
) -> None:
self.wiki_base = wiki_base
self.is_cloud = is_cloud
self.space = space
self.page_id = page_id
self.index_recursively = index_recursively
self.cql_query = cql_query
self.batch_size = batch_size
self.labels_to_skip = labels_to_skip
self.timezone_offset = timezone_offset
self.scoped_token = scoped_token
self._confluence_client: OnyxConfluence | None = None
self._low_timeout_confluence_client: OnyxConfluence | None = None
self._fetched_titles: set[str] = set()
self.allow_images = False
# Remove trailing slash from wiki_base if present
self.wiki_base = wiki_base.rstrip("/")
"""
If nothing is provided, we default to fetching all pages
Only one or none of the following options should be specified so
the order shouldn't matter
However, we use elif to ensure that only of the following is enforced
"""
base_cql_page_query = "type=page"
if cql_query:
base_cql_page_query = cql_query
elif page_id:
if index_recursively:
base_cql_page_query += f" and (ancestor='{page_id}' or id='{page_id}')"
else:
base_cql_page_query += f" and id='{page_id}'"
elif space:
uri_safe_space = quote(space)
base_cql_page_query += f" and space='{uri_safe_space}'"
self.base_cql_page_query = base_cql_page_query
self.cql_label_filter = ""
if labels_to_skip:
labels_to_skip = list(set(labels_to_skip))
comma_separated_labels = ",".join(
f"'{quote(label)}'" for label in labels_to_skip
)
self.cql_label_filter = f" and label not in ({comma_separated_labels})"
self.timezone: timezone = timezone(offset=timedelta(hours=timezone_offset))
self.credentials_provider: CredentialsProviderInterface | None = None
self.probe_kwargs = {
"max_backoff_retries": 6,
"max_backoff_seconds": 10,
}
self.final_kwargs = {
"max_backoff_retries": 10,
"max_backoff_seconds": 60,
}
# deprecated
self.continue_on_failure = continue_on_failure
def set_allow_images(self, value: bool) -> None:
logging.info(f"Setting allow_images to {value}.")
self.allow_images = value
@property
def confluence_client(self) -> OnyxConfluence:
if self._confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
return self._confluence_client
@property
def low_timeout_confluence_client(self) -> OnyxConfluence:
if self._low_timeout_confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
return self._low_timeout_confluence_client
def set_credentials_provider(
self, credentials_provider: CredentialsProviderInterface
) -> None:
self.credentials_provider = credentials_provider
# raises exception if there's a problem
confluence_client = OnyxConfluence(
is_cloud=self.is_cloud,
url=self.wiki_base,
credentials_provider=credentials_provider,
scoped_token=self.scoped_token,
)
confluence_client._probe_connection(**self.probe_kwargs)
confluence_client._initialize_connection(**self.final_kwargs)
self._confluence_client = confluence_client
# create a low timeout confluence client for sync flows
low_timeout_confluence_client = OnyxConfluence(
is_cloud=self.is_cloud,
url=self.wiki_base,
credentials_provider=credentials_provider,
timeout=3,
scoped_token=self.scoped_token,
)
low_timeout_confluence_client._probe_connection(**self.probe_kwargs)
low_timeout_confluence_client._initialize_connection(**self.final_kwargs)
self._low_timeout_confluence_client = low_timeout_confluence_client
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
raise NotImplementedError("Use set_credentials_provider with this connector.")
def _construct_page_cql_query(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> str:
"""
Constructs a CQL query for use in the confluence API. See
https://developer.atlassian.com/server/confluence/advanced-searching-using-cql/
for more information. This is JUST the CQL, not the full URL used to hit the API.
Use _build_page_retrieval_url to get the full URL.
"""
page_query = self.base_cql_page_query + self.cql_label_filter
# Add time filters
if start:
formatted_start_time = datetime.fromtimestamp(
start, tz=self.timezone
).strftime("%Y-%m-%d %H:%M")
page_query += f" and lastmodified >= '{formatted_start_time}'"
if end:
formatted_end_time = datetime.fromtimestamp(end, tz=self.timezone).strftime(
"%Y-%m-%d %H:%M"
)
page_query += f" and lastmodified <= '{formatted_end_time}'"
page_query += " order by lastmodified asc"
return page_query
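    # Example output with a start filter (illustrative):
    #   "type=page and lastmodified >= '2024-01-01 00:00' order by lastmodified asc"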
def _construct_attachment_query(
self,
confluence_page_id: str,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> str:
attachment_query = f"type=attachment and container='{confluence_page_id}'"
attachment_query += self.cql_label_filter
# Add time filters to avoid reprocessing unchanged attachments during refresh
if start:
formatted_start_time = datetime.fromtimestamp(
start, tz=self.timezone
).strftime("%Y-%m-%d %H:%M")
attachment_query += f" and lastmodified >= '{formatted_start_time}'"
if end:
formatted_end_time = datetime.fromtimestamp(end, tz=self.timezone).strftime(
"%Y-%m-%d %H:%M"
)
attachment_query += f" and lastmodified <= '{formatted_end_time}'"
attachment_query += " order by lastmodified asc"
return attachment_query
def _get_comment_string_for_page_id(self, page_id: str) -> str:
comment_string = ""
comment_cql = f"type=comment and container='{page_id}'"
comment_cql += self.cql_label_filter
expand = ",".join(_COMMENT_EXPANSION_FIELDS)
for comment in self.confluence_client.paginated_cql_retrieval(
cql=comment_cql,
expand=expand,
):
comment_string += "\nComment:\n"
comment_string += extract_text_from_confluence_html(
confluence_client=self.confluence_client,
confluence_object=comment,
fetched_titles=set(),
)
return comment_string
def _convert_page_to_document(
self, page: dict[str, Any]
) -> Document | ConnectorFailure:
"""
Converts a Confluence page to a Document object.
Includes the page content, comments, and attachments.
"""
page_id = page_url = ""
try:
# Extract basic page information
page_id = page["id"]
page_title = page["title"]
logging.info(f"Converting page {page_title} to document")
page_url = build_confluence_document_id(
self.wiki_base, page["_links"]["webui"], self.is_cloud
)
# Get the page content
page_content = extract_text_from_confluence_html(
self.confluence_client, page, self._fetched_titles
)
# Create the main section for the page content
sections: list[TextSection | ImageSection] = [
TextSection(text=page_content, link=page_url)
]
# Process comments if available
comment_text = self._get_comment_string_for_page_id(page_id)
if comment_text:
sections.append(
TextSection(text=comment_text, link=f"{page_url}#comments")
)
# Note: attachments are no longer merged into the page document.
# They are indexed as separate documents downstream.
# Extract metadata
metadata = {}
if "space" in page:
metadata["space"] = page["space"].get("name", "")
# Extract labels
labels = []
if "metadata" in page and "labels" in page["metadata"]:
for label in page["metadata"]["labels"].get("results", []):
labels.append(label.get("name", ""))
if labels:
metadata["labels"] = labels
# Extract owners
primary_owners = []
if "version" in page and "by" in page["version"]:
author = page["version"]["by"]
display_name = author.get("displayName", "Unknown")
email = author.get("email", "unknown@domain.invalid")
primary_owners.append(
BasicExpertInfo(display_name=display_name, email=email)
)
# Create the document
return Document(
id=page_url,
source=DocumentSource.CONFLUENCE,
semantic_identifier=page_title,
extension=".html", # Confluence pages are HTML
blob=page_content.encode("utf-8"), # Encode page content as bytes
size_bytes=len(page_content.encode("utf-8")), # Calculate size in bytes
doc_updated_at=datetime_from_string(page["version"]["when"]),
primary_owners=primary_owners if primary_owners else None,
)
except Exception as e:
logging.error(f"Error converting page {page.get('id', 'unknown')}: {e}")
if is_atlassian_date_error(e): # propagate error to be caught and retried
raise
return ConnectorFailure(
failed_document=DocumentFailure(
document_id=page_id,
document_link=page_url,
),
failure_message=f"Error converting page {page.get('id', 'unknown')}: {e}",
exception=e,
)
def _fetch_page_attachments(
self,
page: dict[str, Any],
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> tuple[list[Document], list[ConnectorFailure]]:
"""
Inline attachments are added directly to the document as text or image sections by
this function. The returned documents/connectorfailures are for non-inline attachments
and those at the end of the page.
"""
attachment_query = self._construct_attachment_query(page["id"], start, end)
attachment_failures: list[ConnectorFailure] = []
attachment_docs: list[Document] = []
page_url = ""
for attachment in self.confluence_client.paginated_cql_retrieval(
cql=attachment_query,
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
):
media_type: str = attachment.get("metadata", {}).get("mediaType", "")
# TODO(rkuo): this check is partially redundant with validate_attachment_filetype
# and checks in convert_attachment_to_content/process_attachment
# but doing the check here avoids an unnecessary download. Due for refactoring.
if not self.allow_images:
if media_type.startswith("image/"):
logging.info(
f"Skipping attachment because allow images is False: {attachment['title']}"
)
continue
if not validate_attachment_filetype(
attachment,
):
logging.info(
f"Skipping attachment because it is not an accepted file type: {attachment['title']}"
)
continue
logging.info(
f"Processing attachment: {attachment['title']} attached to page {page['title']}"
)
# Attachment document id: use the download URL for stable identity
try:
object_url = build_confluence_document_id(
self.wiki_base, attachment["_links"]["download"], self.is_cloud
)
except Exception as e:
logging.warning(
f"Invalid attachment url for id {attachment['id']}, skipping"
)
logging.debug(f"Error building attachment url: {e}")
continue
try:
response = convert_attachment_to_content(
confluence_client=self.confluence_client,
attachment=attachment,
page_id=page["id"],
allow_images=self.allow_images,
)
if response is None:
continue
file_storage_name, file_blob = response
if not file_blob:
logging.info("Skipping attachment because it is no blob fetched")
continue
# Build attachment-specific metadata
attachment_metadata: dict[str, str | list[str]] = {}
if "space" in attachment:
attachment_metadata["space"] = attachment["space"].get("name", "")
labels: list[str] = []
if "metadata" in attachment and "labels" in attachment["metadata"]:
for label in attachment["metadata"]["labels"].get("results", []):
labels.append(label.get("name", ""))
if labels:
attachment_metadata["labels"] = labels
page_url = page_url or build_confluence_document_id(
self.wiki_base, page["_links"]["webui"], self.is_cloud
)
attachment_metadata["parent_page_id"] = page_url
attachment_id = build_confluence_document_id(
self.wiki_base, attachment["_links"]["webui"], self.is_cloud
)
primary_owners: list[BasicExpertInfo] | None = None
if "version" in attachment and "by" in attachment["version"]:
author = attachment["version"]["by"]
display_name = author.get("displayName", "Unknown")
email = author.get("email", "unknown@domain.invalid")
primary_owners = [
BasicExpertInfo(display_name=display_name, email=email)
]
extension = Path(attachment.get("title", "")).suffix or ".unknown"
attachment_doc = Document(
id=attachment_id,
source=DocumentSource.CONFLUENCE,
semantic_identifier=attachment.get("title", object_url),
extension=extension,
blob=file_blob,
size_bytes=len(file_blob),
metadata=attachment_metadata,
doc_updated_at=(
datetime_from_string(attachment["version"]["when"])
if attachment.get("version")
and attachment["version"].get("when")
else None
),
primary_owners=primary_owners,
)
attachment_docs.append(attachment_doc)
except Exception as e:
logging.error(
f"Failed to extract/summarize attachment {attachment['title']}",
exc_info=e,
)
if is_atlassian_date_error(e):
# propagate error to be caught and retried
raise
attachment_failures.append(
ConnectorFailure(
failed_document=DocumentFailure(
document_id=object_url,
document_link=object_url,
),
failure_message=f"Failed to extract/summarize attachment {attachment['title']} for doc {object_url}",
exception=e,
)
)
return attachment_docs, attachment_failures
def _fetch_document_batches(
self,
checkpoint: ConfluenceCheckpoint,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> CheckpointOutput[ConfluenceCheckpoint]:
"""
Yields batches of Documents. For each page:
- Create a Document with 1 Section for the page text/comments
- Then fetch attachments. For each attachment:
- Attempt to convert it with convert_attachment_to_content(...)
- If successful, create a new Section with the extracted text or summary.
"""
checkpoint = copy.deepcopy(checkpoint)
# use "start" when last_updated is 0 or for confluence server
start_ts = start
page_query_url = checkpoint.next_page_url or self._build_page_retrieval_url(
start_ts, end, self.batch_size
)
logging.debug(f"page_query_url: {page_query_url}")
# store the next page start for confluence server, cursor for confluence cloud
def store_next_page_url(next_page_url: str) -> None:
checkpoint.next_page_url = next_page_url
for page in self.confluence_client.paginated_page_retrieval(
cql_url=page_query_url,
limit=self.batch_size,
next_page_callback=store_next_page_url,
):
# Build doc from page
doc_or_failure = self._convert_page_to_document(page)
if isinstance(doc_or_failure, ConnectorFailure):
yield doc_or_failure
continue
# yield completed document (or failure)
yield doc_or_failure
# Now get attachments for that page:
attachment_docs, attachment_failures = self._fetch_page_attachments(
page, start, end
)
            # yield attachment docs; failures are collected but currently not yielded
            yield from attachment_docs
            # yield from attachment_failures
# Create checkpoint once a full page of results is returned
if checkpoint.next_page_url and checkpoint.next_page_url != page_query_url:
return checkpoint
checkpoint.has_more = False
return checkpoint
def _build_page_retrieval_url(
self,
start: SecondsSinceUnixEpoch | None,
end: SecondsSinceUnixEpoch | None,
limit: int,
) -> str:
"""
Builds the full URL used to retrieve pages from the confluence API.
This can be used as input to the confluence client's _paginate_url
or paginated_page_retrieval methods.
"""
page_query = self._construct_page_cql_query(start, end)
cql_url = self.confluence_client.build_cql_url(
page_query, expand=",".join(_PAGE_EXPANSION_FIELDS)
)
return update_param_in_path(cql_url, "limit", str(limit))
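    # Illustrative result (exact encoding depends on update_param_in_path):
    #   "rest/api/content/search?cql=type=page order by lastmodified asc&expand=...&limit=50"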
@override
def load_from_checkpoint(
self,
start: SecondsSinceUnixEpoch,
end: SecondsSinceUnixEpoch,
checkpoint: ConfluenceCheckpoint,
) -> CheckpointOutput[ConfluenceCheckpoint]:
end += ONE_DAY # handle time zone weirdness
try:
return self._fetch_document_batches(checkpoint, start, end)
except Exception as e:
if is_atlassian_date_error(e) and start is not None:
logging.warning(
"Confluence says we provided an invalid 'updated' field. This may indicate"
"a real issue, but can also appear during edge cases like daylight"
f"savings time changes. Retrying with a 1 hour offset. Error: {e}"
)
return self._fetch_document_batches(checkpoint, start - ONE_HOUR, end)
raise
@override
def build_dummy_checkpoint(self) -> ConfluenceCheckpoint:
return ConfluenceCheckpoint(has_more=True, next_page_url=None)
@override
def validate_checkpoint_json(self, checkpoint_json: str) -> ConfluenceCheckpoint:
return ConfluenceCheckpoint.model_validate_json(checkpoint_json)
@override
def retrieve_all_slim_docs(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
return self._retrieve_all_slim_docs(
start=start,
end=end,
callback=callback,
include_permissions=False,
)
def retrieve_all_slim_docs_perm_sync(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
"""
Return 'slim' docs (IDs + minimal permission data).
Does not fetch actual text. Used primarily for incremental permission sync.
"""
return self._retrieve_all_slim_docs(
start=start,
end=end,
callback=callback,
include_permissions=True,
)
def _retrieve_all_slim_docs(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
callback: IndexingHeartbeatInterface | None = None,
include_permissions: bool = True,
) -> GenerateSlimDocumentOutput:
doc_metadata_list: list[SlimDocument] = []
restrictions_expand = ",".join(_RESTRICTIONS_EXPANSION_FIELDS)
space_level_access_info: dict[str, ExternalAccess] = {}
if include_permissions:
space_level_access_info = get_all_space_permissions(
self.confluence_client, self.is_cloud
)
def get_external_access(
doc_id: str, restrictions: dict[str, Any], ancestors: list[dict[str, Any]]
) -> ExternalAccess | None:
return get_page_restrictions(
self.confluence_client, doc_id, restrictions, ancestors
) or space_level_access_info.get(page_space_key)
# Query pages
page_query = self.base_cql_page_query + self.cql_label_filter
for page in self.confluence_client.cql_paginate_all_expansions(
cql=page_query,
expand=restrictions_expand,
limit=_SLIM_DOC_BATCH_SIZE,
):
page_id = page["id"]
page_restrictions = page.get("restrictions") or {}
page_space_key = page.get("space", {}).get("key")
page_ancestors = page.get("ancestors", [])
page_id = build_confluence_document_id(
self.wiki_base, page["_links"]["webui"], self.is_cloud
)
doc_metadata_list.append(
SlimDocument(
id=page_id,
external_access=(
get_external_access(page_id, page_restrictions, page_ancestors)
if include_permissions
else None
),
)
)
# Query attachments for each page
attachment_query = self._construct_attachment_query(page["id"])
for attachment in self.confluence_client.cql_paginate_all_expansions(
cql=attachment_query,
expand=restrictions_expand,
limit=_SLIM_DOC_BATCH_SIZE,
):
                # If you skip images, you'll skip them in the permission sync
if not validate_attachment_filetype(
attachment,
):
continue
attachment_restrictions = attachment.get("restrictions", {})
if not attachment_restrictions:
attachment_restrictions = page_restrictions or {}
attachment_space_key = attachment.get("space", {}).get("key")
if not attachment_space_key:
attachment_space_key = page_space_key
attachment_id = build_confluence_document_id(
self.wiki_base,
attachment["_links"]["webui"],
self.is_cloud,
)
doc_metadata_list.append(
SlimDocument(
id=attachment_id,
external_access=(
get_external_access(
attachment_id, attachment_restrictions, []
)
if include_permissions
else None
),
)
)
if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE:
yield doc_metadata_list[:_SLIM_DOC_BATCH_SIZE]
doc_metadata_list = doc_metadata_list[_SLIM_DOC_BATCH_SIZE:]
if callback and callback.should_stop():
raise RuntimeError(
"retrieve_all_slim_docs_perm_sync: Stop signal detected"
)
if callback:
callback.progress("retrieve_all_slim_docs_perm_sync", 1)
yield doc_metadata_list
def validate_connector_settings(self) -> None:
try:
spaces = self.low_timeout_confluence_client.get_all_spaces(limit=1)
except HTTPError as e:
status_code = e.response.status_code if e.response else None
if status_code == 401:
raise CredentialExpiredError(
"Invalid or expired Confluence credentials (HTTP 401)."
)
elif status_code == 403:
raise InsufficientPermissionsError(
"Insufficient permissions to access Confluence resources (HTTP 403)."
)
raise UnexpectedValidationError(
f"Unexpected Confluence error (status={status_code}): {e}"
)
except Exception as e:
raise UnexpectedValidationError(
f"Unexpected error while validating Confluence settings: {e}"
)
if self.space:
try:
self.low_timeout_confluence_client.get_space(self.space)
except ApiError as e:
raise ConnectorValidationError(
"Invalid Confluence space key provided"
) from e
if not spaces or not spaces.get("results"):
raise ConnectorValidationError(
"No Confluence spaces found. Either your credentials lack permissions, or "
"there truly are no spaces in this Confluence instance."
)
if __name__ == "__main__":
import os
# base url
wiki_base = os.environ["CONFLUENCE_URL"]
# auth stuff
username = os.environ["CONFLUENCE_USERNAME"]
access_token = os.environ["CONFLUENCE_ACCESS_TOKEN"]
is_cloud = os.environ["CONFLUENCE_IS_CLOUD"].lower() == "true"
# space + page
space = os.environ["CONFLUENCE_SPACE_KEY"]
# page_id = os.environ["CONFLUENCE_PAGE_ID"]
confluence_connector = ConfluenceConnector(
wiki_base=wiki_base,
space=space,
is_cloud=is_cloud,
# page_id=page_id,
)
credentials_provider = StaticCredentialsProvider(
None,
DocumentSource.CONFLUENCE,
{
"confluence_username": username,
"confluence_access_token": access_token,
},
)
confluence_connector.set_credentials_provider(credentials_provider)
start = 0.0
end = datetime.now().timestamp()
# Fetch all `SlimDocuments`.
for slim_doc in confluence_connector.retrieve_all_slim_docs_perm_sync():
print(slim_doc)
# Fetch all `Documents`.
for doc in load_all_docs_from_checkpoint_connector(
connector=confluence_connector,
start=start,
end=end,
):
print(doc)