mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-03 11:35:29 +00:00
filtering out stupid ads
This commit is contained in:
parent
6ef8226347
commit
549e07bed0
@ -85,6 +85,9 @@ class PdfFilter:
|
|||||||
"file",
|
"file",
|
||||||
"save",
|
"save",
|
||||||
"casino",
|
"casino",
|
||||||
|
"viagra",
|
||||||
|
"cialis",
|
||||||
|
"ciprofloxacin",
|
||||||
}
|
}
|
||||||
seo_word_probs = {word: self.ngram_log_probs[word] for word in seo_words}
|
seo_word_probs = {word: self.ngram_log_probs[word] for word in seo_words}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user