mirror of
				https://github.com/langgenius/dify.git
				synced 2025-10-25 16:08:45 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			80 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			80 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from abc import ABC, abstractmethod
 | |
| from collections.abc import Sequence
 | |
| from typing import Any, Optional
 | |
| 
 | |
| from pydantic import BaseModel, Field
 | |
| 
 | |
| 
 | |
class Document(BaseModel):
    """A piece of text together with metadata that describes it."""

    # The text content of this document.
    page_content: str

    # Arbitrary metadata about the page content (e.g., source, relationships
    # to other documents, etc.). When omitted, each instance gets its own new
    # empty dict via ``default_factory``; the annotation also permits ``None``.
    metadata: Optional[dict] = Field(default_factory=dict)
 | |
| 
 | |
| 
 | |
class BaseDocumentTransformer(ABC):
    """Interface for components that turn one sequence of Documents into another.

    Both a synchronous and an asynchronous entry point are declared abstract,
    so a concrete subclass has to implement ``transform_documents`` and
    ``atransform_documents`` before it can be instantiated.

    Example:
        .. code-block:: python

            class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                embeddings: Embeddings
                similarity_fn: Callable = cosine_similarity
                similarity_threshold: float = 0.95

                class Config:
                    arbitrary_types_allowed = True

                def transform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    stateful_documents = get_stateful_documents(documents)
                    embedded_documents = _get_embeddings_from_stateful_docs(
                        self.embeddings, stateful_documents
                    )
                    included_idxs = _filter_similar_embeddings(
                        embedded_documents, self.similarity_fn, self.similarity_threshold
                    )
                    return [stateful_documents[i] for i in sorted(included_idxs)]

                async def atransform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    raise NotImplementedError

    """

    @abstractmethod
    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Transform a sequence of documents.

        Args:
            documents: The Documents to transform.
            **kwargs: Extra keyword arguments; their meaning is left to the
                implementing subclass.

        Returns:
            A sequence of transformed Documents.
        """

    @abstractmethod
    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronously transform a sequence of documents.

        Args:
            documents: The Documents to transform.
            **kwargs: Extra keyword arguments; their meaning is left to the
                implementing subclass.

        Returns:
            A sequence of transformed Documents.
        """
 | 
