olmocr/pdelfin/filter/filter.py

110 lines
3.6 KiB
Python
Raw Normal View History

2024-09-17 15:16:58 +00:00
import logging
import re
import subprocess
from collections import Counter
from lingua import Language, LanguageDetectorBuilder
from pypdf import PdfReader
from pypdf.errors import DependencyError, PyPdfError
# Request INFO-level configuration of the root logger (logging.basicConfig is
# a no-op when the root logger already has handlers), then create the logger
# used by everything in this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class PdfFilter:
    """Decide whether a PDF should be dropped from further processing.

    A PDF is filtered out when it cannot be read, is a fillable form
    (optional), has no extractable text via ``pdftotext``, is not written in
    one of the accepted languages, or looks like SEO/download spam (optional).
    """

    # Words whose combined relative frequency marks a text as SEO/download
    # spam. Kept at class level so the set is built once, not on every call.
    _SEO_WORDS = frozenset(
        {
            "download",
            "pdf",
            "epub",
            "mobi",
            "free",
            "ebook",
            "file",
            "save",
            "casino",
            "viagra",
            "cialis",
            "ciprofloxacin",
        }
    )

    def __init__(
        self,
        languages_to_keep=None,
        apply_form_check: bool = True,
        apply_download_spam_check: bool = True,
        download_spam_threshold: float = 0.004,
    ):
        """Build the language detector and store the filter settings.

        Args:
            languages_to_keep: Collection of ``lingua.Language`` values that
                are accepted. Defaults to ``[Language.ENGLISH]`` when ``None``.
            apply_form_check: When true, PDFs containing form text fields are
                filtered out.
            apply_download_spam_check: When true, PDFs whose text exceeds the
                spam threshold are filtered out.
            download_spam_threshold: Maximum allowed fraction of words that
                may be SEO/spam words before the PDF is filtered out.
        """
        super().__init__()
        self.language_detector = (
            LanguageDetectorBuilder.from_all_languages()
            .with_preloaded_language_models()
            .build()
        )
        self.languages_to_keep = (
            languages_to_keep if languages_to_keep is not None else [Language.ENGLISH]
        )
        self.apply_form_check = apply_form_check
        self.apply_download_spam_check = apply_download_spam_check
        self.download_spam_threshold = download_spam_threshold

    def _is_form(self, pdf_reader) -> bool:
        """Return True when the reader reports any form text fields."""
        return bool(pdf_reader.get_form_text_fields())

    def _is_download_spam(self, base_text: str) -> bool:
        """Return True when the share of SEO words exceeds the threshold.

        The text is lower-cased, runs of non-word characters are collapsed to
        spaces, and the fraction of words that belong to the SEO word set is
        compared (strictly greater than) to ``self.download_spam_threshold``.
        Text containing no words at all is never considered spam.
        """
        clean_text = re.sub(r"\W+", " ", base_text.strip().lower())
        words = clean_text.split()
        if not words:
            # Nothing to score; also avoids dividing by zero below.
            return False

        word_counts = Counter(words)
        # Counter yields 0 for missing keys, so no membership test is needed.
        seo_score = sum(word_counts[word] for word in self._SEO_WORDS)
        return (seo_score / len(words)) > self.download_spam_threshold

    # Returns True if there is something wrong with this PDF
    def filter_out_pdf(self, local_pdf_path: str) -> bool:
        """Return True if the PDF at ``local_pdf_path`` should be discarded.

        Checks run in this order: the file must open with ``PdfReader``; it
        must not be a form (if enabled); ``pdftotext`` must extract the first
        five pages within 60 seconds and exit with status 0; the detected
        language must be in ``self.languages_to_keep``; and the text must not
        look like download spam (if enabled).

        Returns:
            True to filter the PDF out, False to keep it.
        """
        try:
            pdf_reader = PdfReader(local_pdf_path)
            # Form-field extraction can also fail on a malformed file, so it
            # is kept inside the same guard as opening the PDF.
            is_form = self.apply_form_check and self._is_form(pdf_reader)
        except Exception as e:
            logger.warning(f"Error reading PDF {local_pdf_path}: {e}")
            return True  # Filter out the PDF if an exception occurs

        if is_form:
            logger.info(f"Filtering out {local_pdf_path} because it's a form")
            return True  # Filter out

        # Read the first five pages of text for language calculation
        try:
            pdftotext_result = subprocess.run(
                ["pdftotext", "-f", "1", "-l", "5", local_pdf_path, "-"],
                timeout=60,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
        except subprocess.TimeoutExpired:
            # A PDF that pdftotext cannot process in time is treated as bad
            # rather than aborting the caller.
            logger.warning(f"pdftotext timed out on {local_pdf_path}")
            return True  # Filter out

        if pdftotext_result.returncode != 0:
            logger.warning(
                f"pdftotext returned {pdftotext_result.returncode} on {local_pdf_path}"
            )
            return True  # Filter out

        # Tolerate stray invalid bytes instead of raising UnicodeDecodeError.
        base_text = pdftotext_result.stdout.decode("utf-8", errors="replace")

        # Language check
        language = self.language_detector.detect_language_of(base_text)
        if language not in self.languages_to_keep:
            logger.info(
                f"Filtering out {local_pdf_path} because language was {language}"
            )
            return True  # Filter out

        # Download spam check
        if self.apply_download_spam_check and self._is_download_spam(base_text):
            logger.info(f"Filtering out {local_pdf_path} because of SEO/download spam")
            return True  # Filter out

        return False  # Keep the PDF