Yongteng Lei 465a140727
Feat: refine Confluence connector (#10994)
### What problem does this PR solve?

Refine Confluence connector.
#10953

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
2025-11-04 17:29:11 +08:00

308 lines
8.0 KiB
Python

"""Data model definitions for all connectors"""
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional, List, Sequence, NamedTuple
from typing_extensions import TypedDict, NotRequired
from pydantic import BaseModel
@dataclass(frozen=True)
class ExternalAccess:
# arbitrary limit to prevent excessively large permissions sets
# not internally enforced ... the caller can check this before using the instance
MAX_NUM_ENTRIES = 5000
# Emails of external users with access to the doc externally
external_user_emails: set[str]
# Names or external IDs of groups with access to the doc
external_user_group_ids: set[str]
# Whether the document is public in the external system or Onyx
is_public: bool
def __str__(self) -> str:
"""Prevent extremely long logs"""
def truncate_set(s: set[str], max_len: int = 100) -> str:
s_str = str(s)
if len(s_str) > max_len:
return f"{s_str[:max_len]}... ({len(s)} items)"
return s_str
return (
f"ExternalAccess("
f"external_user_emails={truncate_set(self.external_user_emails)}, "
f"external_user_group_ids={truncate_set(self.external_user_group_ids)}, "
f"is_public={self.is_public})"
)
@property
def num_entries(self) -> int:
return len(self.external_user_emails) + len(self.external_user_group_ids)
@classmethod
def public(cls) -> "ExternalAccess":
return cls(
external_user_emails=set(),
external_user_group_ids=set(),
is_public=True,
)
@classmethod
def empty(cls) -> "ExternalAccess":
"""
A helper function that returns an *empty* set of external user-emails and group-ids, and sets `is_public` to `False`.
This effectively makes the document in question "private" or inaccessible to anyone else.
This is especially helpful to use when you are performing permission-syncing, and some document's permissions aren't able
to be determined (for whatever reason). Setting its `ExternalAccess` to "private" is a feasible fallback.
"""
return cls(
external_user_emails=set(),
external_user_group_ids=set(),
is_public=False,
)
class ExtractionResult(NamedTuple):
"""Structured result from text and image extraction from various file types."""
text_content: str
embedded_images: Sequence[tuple[bytes, str]]
metadata: dict[str, Any]
class TextSection(BaseModel):
"""Text section model"""
link: str
text: str
class ImageSection(BaseModel):
"""Image section model"""
link: str
image_file_id: str
class Document(BaseModel):
"""Document model"""
id: str
source: str
semantic_identifier: str
extension: str
blob: bytes
doc_updated_at: datetime
size_bytes: int
class BasicExpertInfo(BaseModel):
"""Expert information model"""
display_name: Optional[str] = None
first_name: Optional[str] = None
last_name: Optional[str] = None
email: Optional[str] = None
def get_semantic_name(self) -> str:
"""Get semantic name for display"""
if self.display_name:
return self.display_name
elif self.first_name and self.last_name:
return f"{self.first_name} {self.last_name}"
elif self.first_name:
return self.first_name
elif self.last_name:
return self.last_name
else:
return "Unknown"
class SlimDocument(BaseModel):
"""Simplified document model (contains only ID and permission info)"""
id: str
external_access: Optional[Any] = None
class ConnectorCheckpoint(BaseModel):
"""Connector checkpoint model"""
has_more: bool = True
class DocumentFailure(BaseModel):
"""Document processing failure information"""
document_id: str
document_link: str
class EntityFailure(BaseModel):
"""Entity processing failure information"""
entity_id: str
missed_time_range: tuple[datetime, datetime]
class ConnectorFailure(BaseModel):
"""Connector failure information"""
failed_document: Optional[DocumentFailure] = None
failed_entity: Optional[EntityFailure] = None
failure_message: str
exception: Optional[Exception] = None
model_config = {"arbitrary_types_allowed": True}
# Gmail Models
class GmailCredentials(BaseModel):
"""Gmail authentication credentials model"""
primary_admin_email: str
credentials: dict[str, Any]
class GmailThread(BaseModel):
"""Gmail thread data model"""
id: str
messages: list[dict[str, Any]]
class GmailMessage(BaseModel):
"""Gmail message data model"""
id: str
payload: dict[str, Any]
label_ids: Optional[list[str]] = None
# Notion Models
class NotionPage(BaseModel):
"""Represents a Notion Page object"""
id: str
created_time: str
last_edited_time: str
archived: bool
properties: dict[str, Any]
url: str
database_name: Optional[str] = None # Only applicable to database type pages
class NotionBlock(BaseModel):
"""Represents a Notion Block object"""
id: str # Used for the URL
text: str
prefix: str # How this block should be joined with existing text
class NotionSearchResponse(BaseModel):
"""Represents the response from the Notion Search API"""
results: list[dict[str, Any]]
next_cursor: Optional[str]
has_more: bool = False
class NotionCredentials(BaseModel):
"""Notion authentication credentials model"""
integration_token: str
# Slack Models
class ChannelTopicPurposeType(TypedDict):
"""Slack channel topic or purpose"""
value: str
creator: str
last_set: int
class ChannelType(TypedDict):
"""Slack channel"""
id: str
name: str
is_channel: bool
is_group: bool
is_im: bool
created: int
creator: str
is_archived: bool
is_general: bool
unlinked: int
name_normalized: str
is_shared: bool
is_ext_shared: bool
is_org_shared: bool
pending_shared: List[str]
is_pending_ext_shared: bool
is_member: bool
is_private: bool
is_mpim: bool
updated: int
topic: ChannelTopicPurposeType
purpose: ChannelTopicPurposeType
previous_names: List[str]
num_members: int
class AttachmentType(TypedDict):
"""Slack message attachment"""
service_name: NotRequired[str]
text: NotRequired[str]
fallback: NotRequired[str]
thumb_url: NotRequired[str]
thumb_width: NotRequired[int]
thumb_height: NotRequired[int]
id: NotRequired[int]
class BotProfileType(TypedDict):
"""Slack bot profile"""
id: NotRequired[str]
deleted: NotRequired[bool]
name: NotRequired[str]
updated: NotRequired[int]
app_id: NotRequired[str]
team_id: NotRequired[str]
class MessageType(TypedDict):
"""Slack message"""
type: str
user: str
text: str
ts: str
attachments: NotRequired[List[AttachmentType]]
bot_id: NotRequired[str]
app_id: NotRequired[str]
bot_profile: NotRequired[BotProfileType]
thread_ts: NotRequired[str]
subtype: NotRequired[str]
# Thread message list
ThreadType = List[MessageType]
class SlackCheckpoint(TypedDict):
"""Slack checkpoint"""
channel_ids: List[str] | None
channel_completion_map: dict[str, str]
current_channel: ChannelType | None
current_channel_access: Any | None
seen_thread_ts: List[str]
has_more: bool
class SlackMessageFilterReason(str):
"""Slack message filter reason"""
BOT = "bot"
DISALLOWED = "disallowed"
class ProcessedSlackMessage:
"""Processed Slack message"""
def __init__(self, doc=None, thread_or_message_ts=None, filter_reason=None, failure=None):
self.doc = doc
self.thread_or_message_ts = thread_or_message_ts
self.filter_reason = filter_reason
self.failure = failure
# Type aliases for type hints
SecondsSinceUnixEpoch = float
GenerateDocumentsOutput = Any
GenerateSlimDocumentOutput = Any
CheckpointOutput = Any