mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-01 18:43:45 +00:00
filtering out stupid ads
This commit is contained in:
parent
6ef8226347
commit
549e07bed0
@ -85,6 +85,9 @@ class PdfFilter:
|
||||
"file",
|
||||
"save",
|
||||
"casino",
|
||||
"viagra",
|
||||
"cialis",
|
||||
"ciprofloxacin",
|
||||
}
|
||||
seo_word_probs = {word: self.ngram_log_probs[word] for word in seo_words}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user