Moving pdf filter code over with tests

Jake Poznanski 2024-09-17 15:16:58 +00:00
parent 9662718bfd
commit a534a0180d
15 changed files with 195 additions and 4 deletions

33
pdelfin/datatypes.py Normal file

@@ -0,0 +1,33 @@
import datetime
import hashlib
import json
from dataclasses import dataclass


@dataclass(frozen=True)
class PdfOutput:
    path: str
    text: str
    total_pdf_pages: int
    processed_pdf_pages: int

    def mk_dolma_doc(self, **kwargs) -> str:
        metadata = {
            "Source-File": self.path,
            "pdf-pages": self.processed_pdf_pages,
            "pdf-total-pages": self.total_pdf_pages,
            # Kwargs are added as extra metadata
            **kwargs,
        }

        id_ = hashlib.sha1(self.text.encode()).hexdigest()

        dolma_doc = {
            "id": id_,
            "text": self.text,
            "source": "s2pdf",
            "added": datetime.datetime.now().strftime("%Y-%m-%d"),
            "created": datetime.datetime.now().strftime("%Y-%m-%d"),
            "metadata": metadata,
        }
        return json.dumps(dolma_doc)
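
A quick usage sketch (the path and keyword argument below are hypothetical, not part of the commit):

from pdelfin.datatypes import PdfOutput

output = PdfOutput(
    path="s3://bucket/example.pdf",  # hypothetical source path
    text="Extracted text of the first pages...",
    total_pdf_pages=10,
    processed_pdf_pages=5,
)

# Extra keyword arguments are merged into the metadata dict
print(output.mk_dolma_doc(subset="demo"))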

147
pdelfin/filter.py Normal file

@@ -0,0 +1,147 @@
import csv
import logging
import math
import re
import subprocess
from collections import Counter
from io import StringIO

import requests
from lingua import Language, LanguageDetectorBuilder
from pypdf import PdfReader
from pypdf.errors import DependencyError, PyPdfError

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class PdfFilter:
    def __init__(self):
        super().__init__()
        self.language_detector = (
            LanguageDetectorBuilder.from_all_languages()
            .with_preloaded_language_models()
            .build()
        )
        self.ngram_log_probs = self._build_ngram_log_probs()
    # Used for comparing frequency of words to eliminate bad documents
    def _build_ngram_log_probs(self):
        NGRAM_DATASET_LINK = "https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/lucas/google-1T-unigram/unigram_freq.csv"
        ngrams = {}

        # Download the dataset
        response = requests.get(NGRAM_DATASET_LINK, timeout=60)
        if response.status_code != 200:
            raise Exception(
                f"Failed to download data, status code: {response.status_code}"
            )

        # Read the CSV content
        csv_content = StringIO(response.text)
        reader = csv.DictReader(csv_content)

        # Build the frequency dictionary
        total_count = 0
        for row in reader:
            word = row["word"]
            count = int(row["count"])
            total_count += count
            ngrams[word] = count

        # Convert raw counts to log probabilities
        return {word: math.log(count / total_count) for word, count in ngrams.items()}
    def _is_form(self, local_pdf_path: str) -> bool:
        # Flag PDFs which are forms, or which cannot be read at all,
        # so they get filtered out (as the log messages below state)
        try:
            pdf_reader = PdfReader(local_pdf_path)
            if pdf_reader.get_form_text_fields():
                return True
        except PyPdfError as pex:
            logger.exception(pex)
            logger.warning("Invalid PDF, filtering out")
            return True
        except DependencyError as dex:
            logger.warning(f"PDF requires external dependency {dex}, filtering out")
            return True
        except Exception as ex:
            logger.exception(ex)
            logger.warning("Internal error reading PDF, filtering out")
            return True

        # TODO: If the distribution of _ characters is very high, it's probably a form
        return False
    def _is_download_spam(self, base_text: str, threshold: float = 0.004) -> bool:
        seo_words = {
            "download",
            "pdf",
            "epub",
            "mobi",
            "free",
            "ebook",
            "file",
            "save",
            "casino",
        }

        # Background log-probabilities of the SEO words (not yet used in the score below)
        seo_word_probs = {word: self.ngram_log_probs[word] for word in seo_words}

        base_text = base_text.strip().lower()
        clean_text = re.sub(r"\W+", " ", base_text)

        word_counts = Counter(clean_text.split())
        total_words = len(clean_text.split())
        if total_words == 0:
            return False

        # Fraction of all words that are SEO/download-spam words
        seo_score = sum(word_counts[word] for word in seo_words if word in word_counts)
        return seo_score / total_words > threshold
    # Returns True if there is something wrong with this PDF and it should be filtered out
    def filter_out_pdf(self, local_pdf_path: str) -> bool:
        # Basic metadata-level filtering
        if self._is_form(local_pdf_path):
            logger.info(f"Filtering out {local_pdf_path} because it is a form")
            return True

        # Read the first five pages of text for language calculation
        pdftotext_result = subprocess.run(
            ["pdftotext", "-f", "1", "-l", "5", local_pdf_path, "-"],
            timeout=60,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        if pdftotext_result.returncode != 0:
            logger.warning(
                f"pdftotext returned {pdftotext_result.returncode} on {local_pdf_path}"
            )
            return False

        base_text = pdftotext_result.stdout.decode("utf-8")

        # Other filter ideas:
        # - Remove patents; they tend to be OCRed, multicolumn, and should come in through a cleaner dataset
        # - Detect things with too many figures
        # - Detect too many pages with no input
        # - Off distribution in terms of words per page, etc.

        if len(base_text) < 100 or len(base_text.split()) < 50:
            logger.warning("Not enough text to analyze; keeping the PDF")
            return False

        language = self.language_detector.detect_language_of(base_text)
        if language != Language.ENGLISH:
            logger.info(
                f"Filtering out {local_pdf_path} because language was {language}"
            )
            return True

        if self._is_download_spam(base_text):
            logger.info(f"Filtering out {local_pdf_path} because of SEO/download spam")
            return True

        return False
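
A minimal usage sketch (the local path is hypothetical; assumes the pdftotext binary is installed, since the filter shells out to it):

from pdelfin.filter import PdfFilter

pdf_filter = PdfFilter()  # downloads the unigram frequency table at construction time

# True means the PDF should be dropped (form, unreadable, non-English, or download spam)
if pdf_filter.filter_out_pdf("/tmp/example.pdf"):  # hypothetical path
    print("filtered out")
else:
    print("keeping")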

pyproject.toml

@@ -19,7 +19,8 @@ authors = [
 ]
 requires-python = ">=3.8"
 dependencies = [
-    # Add your own dependencies here
+    "pypdf",
+    "lingua-language-detector"
 ]
 license = {file = "LICENSE"}
@@ -71,7 +72,7 @@ pdelfin = ["py.typed"]
 version = {attr = "pdelfin.version.VERSION"}
 [tool.black]
-line-length = 100
+line-length = 140
 include = '\.pyi?$'
 exclude = '''
 (
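
Note: lingua-language-detector is the PyPI package that provides the lingua module imported in filter.py, and pypdf provides PdfReader.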

9 binary files not shown.

Deleted file

@@ -1,2 +0,0 @@
-def test_hello():
-    print("Hello, World!")

12
tests/test_filter.py Normal file

@@ -0,0 +1,12 @@
import os
import unittest

from pdelfin.filter import PdfFilter


class PdfFilterTest(unittest.TestCase):
    def setUp(self) -> None:
        self.filter = PdfFilter()

    def testFormLaterPages(self):
        pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf")
        self.assertTrue(self.filter._is_form(pdf_path))
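
The new test can be run with the standard unittest runner, e.g. python -m unittest discover tests. Note that PdfFilter's constructor downloads the unigram frequency table, so the test requires network access.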