olmocr/pdelfin/filter/coherency.py

# Uses a pretrained KenLM language model, trained on high-quality DCLM-filtered web data,
# to help identify PDFs whose content has been very poorly parsed.
import kenlm

from functools import lru_cache
from cached_path import cached_path

# S3 URI of the KenLM model binary; load_kenlm() resolves it to a local file via cached_path.
# NOTE(review): the filename suggests a 5-gram model over tokenized text -- confirm.
KENLM_S3_PATH = "s3://ai2-oe-data/jakep/kenlm-dclm/5gramtok.bin"

@lru_cache()
def load_kenlm():
    """Load the KenLM model referenced by ``KENLM_S3_PATH``.

    The call is memoized with ``lru_cache`` and takes no arguments, so the
    model file is resolved and the model constructed only on the first call;
    later calls return the same object.

    Returns:
        The ``kenlm.Model`` built from the locally resolved model file.
    """
    local_path = cached_path(KENLM_S3_PATH)

    # cached_path may hand back a pathlib.Path, whereas the kenlm binding
    # expects a str/bytes path and rejects other types with a TypeError.
    # Converting explicitly is harmless if a plain string was returned.
    return kenlm.Model(str(local_path))


def get_document_coherency(text: str) -> float:
    """Score ``text`` with the shared KenLM model.

    Args:
        text: The document text to evaluate.

    Returns:
        The value of ``model.score(text)`` for the model returned by
        ``load_kenlm()``.
        NOTE(review): KenLM documents this as a total log10 probability,
        not normalized by length -- confirm that is what callers expect.
    """
    return load_kenlm().score(text)
Moving a whole bunch of code over, still broken 2024-09-17 16:26:55 +00:00			`# Uses a premade kenLM filter trained on good DCLM filtered web data to help identify pdfs where the`
			`# content has been very poorly parsed`
			`import kenlm`

			`from functools import lru_cache`
			`from cached_path import cached_path`

			`KENLM_S3_PATH = "s3://ai2-oe-data/jakep/kenlm-dclm/5gramtok.bin"`

			`@lru_cache()`
			`def load_kenlm():`
			`local_path = cached_path(KENLM_S3_PATH)`
			`model = kenlm.Model(local_path)`

			`return model`


			`def get_document_coherency(text: str) -> float:`
			`model = load_kenlm()`

			`return model.score(text)`