mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-11 16:22:29 +00:00
29 lines
815 B
Python
29 lines
815 B
Python
from typing import Type
|
|
|
|
from spacy.lang.en import English
|
|
|
|
from .registry import BaseRegistry
|
|
|
|
|
|
class SegmenterRegistry(BaseRegistry[Type["BaseSegmenter"]]):
    """Registry whose entries are ``BaseSegmenter`` subclasses.

    Segmenter classes register themselves under a string key with the
    ``SegmenterRegistry.add(<name>)`` class decorator.
    """
|
|
|
|
|
|
class BaseSegmenter:
    """Common interface for objects that split text into pieces.

    Concrete segmenters are expected to override :meth:`segment`; the
    implementation here only raises ``NotImplementedError``.
    """

    def __init__(self, segmenter_name_or_path: str, *args, **kwargs):
        """Create the segmenter.

        Args:
            segmenter_name_or_path: Identifier of the segmenter. It is
                accepted for interface uniformity but not used by this
                base implementation.
            *args: Extra positional arguments; ignored here.
            **kwargs: Extra keyword arguments; ignored here.
        """
        super(BaseSegmenter, self).__init__()

    def segment(self, text: str) -> list[str]:
        """Split ``text`` into a list of string segments.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
|
|
|
|
|
|
@SegmenterRegistry.add("spacy")
class SpacySegmenter(BaseSegmenter):
    """Sentence segmenter backed by spaCy's ``English`` pipeline.

    The pipeline is created with ``English()`` and contains only the
    ``"sentencizer"`` component, which is added in ``__init__``.
    """

    def __init__(self, segmenter_name_or_path: str, *args, **kwargs):
        """Build the spaCy pipeline used for sentence splitting.

        Args:
            segmenter_name_or_path: Must be exactly ``"spacy"``.
            *args: Extra positional arguments, forwarded to the base class.
            **kwargs: Extra keyword arguments, forwarded to the base class.

        Raises:
            AssertionError: If ``segmenter_name_or_path`` is not ``"spacy"``.
        """
        # Raise explicitly rather than via an ``assert`` statement: asserts
        # are removed under ``python -O``, which would silently accept an
        # unsupported name. AssertionError is kept so the exception type
        # seen by existing callers does not change.
        if segmenter_name_or_path != "spacy":
            raise AssertionError("Only 'spacy' segmenter is supported")

        super().__init__(segmenter_name_or_path, *args, **kwargs)

        self.nlp = English()
        self.nlp.add_pipe("sentencizer")

    def segment(self, text: str) -> list[str]:
        """Split ``text`` into sentences.

        Args:
            text: The text to segment.

        Returns:
            One string per sentence found by the pipeline, each taken from
            the sentence span's ``text_with_ws`` (the sentence text together
            with its trailing whitespace, per spaCy's API).
        """
        doc = self.nlp(text)
        return [sentence.text_with_ws for sentence in doc.sents]
|