mirror of
				https://github.com/allenai/olmocr.git
				synced 2025-10-31 01:55:06 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			29 lines
		
	
	
		
			815 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			29 lines
		
	
	
		
			815 B
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import Type
 | |
| 
 | |
| from spacy.lang.en import English
 | |
| 
 | |
| from .registry import BaseRegistry
 | |
| 
 | |
| 
 | |
class SegmenterRegistry(BaseRegistry[Type["BaseSegmenter"]]):
    """Registry whose entries are ``BaseSegmenter`` subclasses.

    All behaviour comes from ``BaseRegistry``; this subclass only fixes the
    type of the registered values. Segmenter classes in this module attach
    themselves with the ``SegmenterRegistry.add(<name>)`` decorator.
    """
 | |
| 
 | |
| 
 | |
class BaseSegmenter:
    """Common interface for text segmenters.

    The base class only pins down the constructor and ``segment`` signatures;
    concrete subclasses are expected to override ``segment``.
    """

    def __init__(self, segmenter_name_or_path: str, *args, **kwargs) -> None:
        """Create the segmenter.

        Args:
            segmenter_name_or_path: Identifier of the segmenter. The base
                class accepts it but does not store or inspect it.
            *args: Accepted for signature compatibility; ignored here.
            **kwargs: Accepted for signature compatibility; ignored here.
        """
        super().__init__()

    def segment(self, text: str) -> list[str]:
        """Split ``text`` into a list of segments.

        Args:
            text: The text to split.

        Raises:
            NotImplementedError: Always; the base class has no implementation.
        """
        raise NotImplementedError
 | |
| 
 | |
| 
 | |
@SegmenterRegistry.add("spacy")
class SpacySegmenter(BaseSegmenter):
    """Sentence segmenter built on spaCy's ``sentencizer`` pipe.

    The class is decorated with ``SegmenterRegistry.add("spacy")`` and only
    accepts ``"spacy"`` as its ``segmenter_name_or_path``.
    """

    def __init__(self, segmenter_name_or_path: str, *args, **kwargs) -> None:
        """Build an ``English`` pipeline with a ``sentencizer`` pipe added.

        Args:
            segmenter_name_or_path: Must be exactly ``"spacy"``.
            *args: Forwarded to ``BaseSegmenter.__init__``.
            **kwargs: Forwarded to ``BaseSegmenter.__init__``.

        Raises:
            AssertionError: If ``segmenter_name_or_path`` is not ``"spacy"``.
        """
        # Raise explicitly instead of using an ``assert`` statement: asserts
        # are stripped when Python runs with -O, which would silently disable
        # this check. The exception type and message are kept unchanged so
        # existing callers observe the same error.
        if segmenter_name_or_path != "spacy":
            raise AssertionError("Only 'spacy' segmenter is supported")

        super().__init__(segmenter_name_or_path, *args, **kwargs)

        self.nlp = English()
        self.nlp.add_pipe("sentencizer")

    def segment(self, text: str) -> list[str]:
        """Split ``text`` into sentences.

        Args:
            text: The text to run through the pipeline.

        Returns:
            One string per sentence found by the pipeline, in document order.
            Each string is the sentence's ``text_with_ws``, i.e. it keeps the
            whitespace that follows the sentence, if any.
        """
        return [sent.text_with_ws for sent in self.nlp(text).sents]
 | 
