mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-04 02:46:33 +00:00
### What problem does this PR solve?

Refine Confluence connector. #10953

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
308 lines
8.0 KiB
Python
308 lines
8.0 KiB
Python
"""Data model definitions for all connectors"""
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
from typing import Any, Optional, List, Sequence, NamedTuple
|
|
from typing_extensions import TypedDict, NotRequired
|
|
from pydantic import BaseModel
|
|
|
|
|
|
@dataclass(frozen=True)
class ExternalAccess:
    """Immutable record of who can access a document in its external system.

    Holds the set of user emails and the set of group identifiers that have
    access, plus a flag for publicly visible documents.
    """

    # Arbitrary cap meant to guard against excessively large permission sets.
    # Nothing in this class enforces it; the caller can check it before using
    # an instance.
    MAX_NUM_ENTRIES = 5000

    # Emails of the external users that can access the document.
    external_user_emails: set[str]
    # Names or external IDs of the groups that can access the document.
    external_user_group_ids: set[str]
    # Whether the document is public in the external system or Onyx.
    is_public: bool

    def __str__(self) -> str:
        """Return a log-friendly rendering in which long sets are clipped."""

        def _clip(values: set[str], limit: int = 100) -> str:
            # Render the set once; only when the rendering is too long do we
            # cut it and append the real element count.
            rendered = str(values)
            if len(rendered) <= limit:
                return rendered
            return f"{rendered[:limit]}... ({len(values)} items)"

        emails = _clip(self.external_user_emails)
        groups = _clip(self.external_user_group_ids)
        return (
            "ExternalAccess("
            f"external_user_emails={emails}, "
            f"external_user_group_ids={groups}, "
            f"is_public={self.is_public})"
        )

    @property
    def num_entries(self) -> int:
        """Total number of user emails plus group ids held by this instance."""
        return sum(
            len(entries)
            for entries in (self.external_user_emails, self.external_user_group_ids)
        )

    @classmethod
    def public(cls) -> "ExternalAccess":
        """Build an instance with no users or groups and `is_public=True`."""
        no_users: set[str] = set()
        no_groups: set[str] = set()
        return cls(
            external_user_emails=no_users,
            external_user_group_ids=no_groups,
            is_public=True,
        )

    @classmethod
    def empty(cls) -> "ExternalAccess":
        """Build an instance with no users or groups and `is_public=False`.

        The result effectively marks the document as "private", i.e.
        inaccessible to anyone else. It is a feasible fallback during
        permission syncing when a document's permissions cannot be
        determined for whatever reason.
        """
        no_users: set[str] = set()
        no_groups: set[str] = set()
        return cls(
            external_user_emails=no_users,
            external_user_group_ids=no_groups,
            is_public=False,
        )
|
|
|
|
|
|
class ExtractionResult(NamedTuple):
    """Structured outcome of extracting text and images from a file.

    Being a NamedTuple, instances are immutable and can be unpacked
    positionally as `(text_content, embedded_images, metadata)`.
    """

    # The extracted text.
    text_content: str
    # One `(bytes, str)` pair per embedded image; presumably the image payload
    # and a name/identifier for it -- confirm against the producers.
    embedded_images: Sequence[tuple[bytes, str]]
    # Free-form metadata keyed by string.
    metadata: dict[str, Any]
|
|
|
|
|
|
class TextSection(BaseModel):
    """A section of text together with the link it is associated with."""

    # Link for this section (both fields are required).
    link: str
    # The section's text.
    text: str
|
|
|
|
|
|
class ImageSection(BaseModel):
    """An image section, referring to its image by a file id."""

    # Link for this section (both fields are required).
    link: str
    # Identifier of the image file; where that file lives is not shown here.
    image_file_id: str
|
|
|
|
|
|
class Document(BaseModel):
    """A document produced by a connector; every field is required."""

    # Identifier of the document.
    id: str
    # Source the document came from (presumably the connector/source name --
    # confirm against callers).
    source: str
    # Human-readable identifier for the document.
    semantic_identifier: str
    # File extension (whether it includes the leading dot is not shown here).
    extension: str
    # Binary content of the document.
    blob: bytes
    # When the document was last updated.
    doc_updated_at: datetime
    # Size in bytes (presumably of `blob` -- confirm).
    size_bytes: int
|
|
|
|
|
|
class BasicExpertInfo(BaseModel):
    """Basic information about an expert; every field is optional."""

    display_name: str | None = None
    first_name: str | None = None
    last_name: str | None = None
    email: str | None = None

    def get_semantic_name(self) -> str:
        """Return the best available display name.

        Preference order: `display_name`, then "first last", then whichever
        of first/last name is set, and finally the literal "Unknown".

        NOTE(review): `email` is never consulted, so an expert that has only
        an email set is reported as "Unknown".
        """
        if self.display_name:
            return self.display_name

        given, family = self.first_name, self.last_name
        if given and family:
            return f"{given} {family}"

        # At most one of the two is truthy here; fall through to the default.
        return given or family or "Unknown"
|
|
|
|
|
|
class SlimDocument(BaseModel):
    """Simplified document model that carries only an id and permission info."""

    # Identifier of the document.
    id: str
    # Permission info; typed as Any, so no structure is validated here
    # (presumably an ExternalAccess -- confirm against callers).
    external_access: Any | None = None
|
|
|
|
|
|
class ConnectorCheckpoint(BaseModel):
    """Checkpoint state for a connector."""

    # Defaults to True, i.e. a fresh checkpoint reports that more work remains.
    has_more: bool = True
|
|
|
|
|
|
class DocumentFailure(BaseModel):
    """Identifies a document whose processing failed."""

    # Identifier of the failed document.
    document_id: str
    # Link to the failed document.
    document_link: str
|
|
|
|
|
|
class EntityFailure(BaseModel):
    """Identifies an entity whose processing failed."""

    # Identifier of the failed entity.
    entity_id: str
    # Pair of datetimes for the time range that was missed (presumably
    # `(start, end)` -- confirm against producers).
    missed_time_range: tuple[datetime, datetime]
|
|
|
|
|
|
class ConnectorFailure(BaseModel):
    """Describes a failure reported by a connector.

    Only `failure_message` is required. Nothing here enforces that a failed
    document or a failed entity is actually supplied.
    """

    # `exception` holds a plain Exception instance, which is not a
    # pydantic-native type, hence arbitrary types must be allowed.
    model_config = {"arbitrary_types_allowed": True}

    # The document that failed, if the failure concerns a document.
    failed_document: DocumentFailure | None = None
    # The entity that failed, if the failure concerns an entity.
    failed_entity: EntityFailure | None = None
    # Description of the failure.
    failure_message: str
    # The exception associated with the failure, when there is one.
    exception: Exception | None = None
|
|
|
|
|
|
# Gmail Models
|
|
class GmailCredentials(BaseModel):
    """Authentication credentials for Gmail."""

    # Email address of the primary admin.
    primary_admin_email: str
    # Credential payload; its keys are not constrained by this model.
    credentials: dict[str, Any]
|
|
|
|
|
|
class GmailThread(BaseModel):
    """A Gmail thread: its id and the messages it holds."""

    # Identifier of the thread.
    id: str
    # Messages as untyped dicts; their schema is not constrained here.
    messages: list[dict[str, Any]]
|
|
|
|
|
|
class GmailMessage(BaseModel):
    """A single Gmail message."""

    # Identifier of the message.
    id: str
    # Message payload as an untyped dict; its schema is not constrained here.
    payload: dict[str, Any]
    # Label ids attached to the message, when provided.
    label_ids: list[str] | None = None
|
|
|
|
|
|
# Notion Models
|
|
class NotionPage(BaseModel):
    """Model of a Notion Page object."""

    # Identifier of the page.
    id: str
    # Creation and last-edit times, kept as the strings they arrive in
    # (no datetime parsing happens in this model).
    created_time: str
    last_edited_time: str
    # Whether the page is archived.
    archived: bool
    # Page properties as an untyped mapping.
    properties: dict[str, Any]
    # URL of the page.
    url: str
    # Set only for pages of the database type.
    database_name: str | None = None
|
|
|
|
|
|
class NotionBlock(BaseModel):
    """Model of a Notion Block object."""

    # Block id; it is used for building the URL.
    id: str
    # Text of the block.
    text: str
    # Determines how this block gets joined with the text that already exists.
    prefix: str
|
|
|
|
|
|
class NotionSearchResponse(BaseModel):
    """Model of the response returned by the Notion Search API."""

    # Result objects as untyped dicts; their schema is not constrained here.
    results: list[dict[str, Any]]
    # Cursor for the next page, or None when there is none. The explicit
    # default matters: under pydantic v2 an `Optional[...]` annotation without
    # a default is a *required* nullable field, so a payload omitting the key
    # would fail validation even though `has_more` tolerates omission.
    next_cursor: Optional[str] = None
    # Whether further pages exist; treated as False when absent.
    has_more: bool = False
|
|
|
|
|
|
class NotionCredentials(BaseModel):
    """Authentication credentials for Notion."""

    # Token of the Notion integration.
    integration_token: str
|
|
|
|
|
|
# Slack Models
|
|
class ChannelTopicPurposeType(TypedDict):
    """Shape of a Slack channel's topic or purpose entry."""

    # The topic/purpose text.
    value: str
    # Who set it (presumably a Slack user id -- confirm).
    creator: str
    # When it was last set, as an int (presumably a Unix timestamp -- confirm).
    last_set: int
|
|
|
|
|
|
class ChannelType(TypedDict):
    """Shape of a Slack channel object; every key is required."""

    # Identity.
    id: str
    name: str

    # Conversation-kind flags.
    is_channel: bool
    is_group: bool
    is_im: bool

    # Creation info (`created` is an int, presumably a Unix timestamp -- confirm).
    created: int
    creator: str

    is_archived: bool
    is_general: bool
    unlinked: int
    name_normalized: str

    # Sharing state.
    is_shared: bool
    is_ext_shared: bool
    is_org_shared: bool
    pending_shared: List[str]
    is_pending_ext_shared: bool

    # Membership and visibility.
    is_member: bool
    is_private: bool
    is_mpim: bool

    updated: int

    # Topic and purpose share the same nested shape.
    topic: ChannelTopicPurposeType
    purpose: ChannelTopicPurposeType

    previous_names: List[str]
    num_members: int
|
|
|
|
|
|
class AttachmentType(TypedDict):
|
|
"""Slack message attachment"""
|
|
service_name: NotRequired[str]
|
|
text: NotRequired[str]
|
|
fallback: NotRequired[str]
|
|
thumb_url: NotRequired[str]
|
|
thumb_width: NotRequired[int]
|
|
thumb_height: NotRequired[int]
|
|
id: NotRequired[int]
|
|
|
|
|
|
class BotProfileType(TypedDict):
|
|
"""Slack bot profile"""
|
|
id: NotRequired[str]
|
|
deleted: NotRequired[bool]
|
|
name: NotRequired[str]
|
|
updated: NotRequired[int]
|
|
app_id: NotRequired[str]
|
|
team_id: NotRequired[str]
|
|
|
|
|
|
class MessageType(TypedDict):
    """Shape of a Slack message."""

    # Keys that are always present.
    type: str
    user: str
    text: str
    # Message timestamp, kept as a string.
    ts: str

    # Keys that may be absent.
    attachments: NotRequired[List[AttachmentType]]
    bot_id: NotRequired[str]
    app_id: NotRequired[str]
    bot_profile: NotRequired[BotProfileType]
    # Timestamp string of the thread (presumably of the thread's parent
    # message -- confirm).
    thread_ts: NotRequired[str]
    subtype: NotRequired[str]
|
|
|
|
|
|
# A thread is represented as the list of its messages.
ThreadType = List[MessageType]
|
|
|
|
|
|
class SlackCheckpoint(TypedDict):
    """Shape of the Slack connector's checkpoint; every key is required."""

    # Channel ids to process; None is allowed (presumably meaning "not yet
    # determined" -- confirm against the connector).
    channel_ids: List[str] | None
    # String-to-string map (presumably channel id to a completion marker --
    # confirm).
    channel_completion_map: dict[str, str]
    # The channel currently being processed, if any.
    current_channel: ChannelType | None
    # Access info for the current channel; untyped here.
    current_channel_access: Any | None
    # Thread timestamps that have been seen.
    seen_thread_ts: List[str]
    # Whether more work remains.
    has_more: bool
|
|
|
|
|
|
class SlackMessageFilterReason(str):
    """Reasons for which a Slack message gets filtered.

    NOTE(review): this is a plain `str` subclass rather than an Enum, so the
    constants below are ordinary strings, not instances of this class.
    """

    # Filter reason: "bot".
    BOT = "bot"
    # Filter reason: "disallowed".
    DISALLOWED = "disallowed"
|
|
|
|
|
|
class ProcessedSlackMessage:
    """Plain container for the outcome of processing one Slack message.

    All four attributes default to None and are stored unchanged; no
    validation takes place.
    """

    def __init__(
        self,
        doc=None,
        thread_or_message_ts=None,
        filter_reason=None,
        failure=None,
    ) -> None:
        """Store the given values on the instance.

        Args:
            doc: The resulting document, if any.
            thread_or_message_ts: Timestamp of the thread or of the message.
            filter_reason: Why the message was filtered, if it was
                (presumably a SlackMessageFilterReason value -- confirm).
            failure: Failure information, if processing failed.
        """
        self.doc = doc
        self.thread_or_message_ts = thread_or_message_ts
        self.filter_reason = filter_reason
        self.failure = failure
|
|
|
|
|
|
# Aliases used purely in type hints.

# A point in time, expressed in seconds since the Unix epoch.
SecondsSinceUnixEpoch = float

# The connector output types are left untyped here.
GenerateDocumentsOutput = Any
GenerateSlimDocumentOutput = Any
CheckpointOutput = Any