Moving pdf filter code over with tests

Jake Poznanski 2024-09-17 15:16:58 +00:00
parent 9662718bfd
commit a534a0180d
15 changed files with 195 additions and 4 deletions

33
pdelfin/datatypes.py Normal file

@@ -0,0 +1,33 @@
import datetime
import hashlib
import json
from dataclasses import dataclass


@dataclass(frozen=True)
class PdfOutput:
    path: str
    text: str
    total_pdf_pages: int
    processed_pdf_pages: int

    def mk_dolma_doc(self, **kwargs) -> str:
        metadata = {
            "Source-File": self.path,
            "pdf-pages": self.processed_pdf_pages,
            "pdf-total-pages": self.total_pdf_pages,
            # Kwargs are added as extra metadata
            **kwargs,
        }

        id_ = hashlib.sha1(self.text.encode()).hexdigest()

        dolma_doc = {
            "id": id_,
            "text": self.text,
            "source": "s2pdf",
            "added": datetime.datetime.now().strftime("%Y-%m-%d"),
            "created": datetime.datetime.now().strftime("%Y-%m-%d"),
            "metadata": metadata,
        }
        return json.dumps(dolma_doc)
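
A quick usage sketch (the path and keyword argument below are hypothetical, not part of the commit):

from pdelfin.datatypes import PdfOutput

output = PdfOutput(
    path="s3://bucket/example.pdf",  # hypothetical source path
    text="Extracted text of the first pages...",
    total_pdf_pages=10,
    processed_pdf_pages=5,
)

# Extra keyword arguments are merged into the metadata dict
print(output.mk_dolma_doc(subset="demo"))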

147
pdelfin/filter.py Normal file

@@ -0,0 +1,147 @@
import csv
import logging
import math
import re
import subprocess
from collections import Counter
from io import StringIO

import requests
from lingua import Language, LanguageDetectorBuilder
from pypdf import PdfReader
from pypdf.errors import DependencyError, PyPdfError

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
class PdfFilter:
    def __init__(self):
        super().__init__()
        self.language_detector = (
            LanguageDetectorBuilder.from_all_languages()
            .with_preloaded_language_models()
            .build()
        )
        self.ngram_log_probs = self._build_ngram_log_probs()
    # Used for comparing frequency of words to eliminate bad documents
    def _build_ngram_log_probs(self):
        NGRAM_DATASET_LINK = "https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/lucas/google-1T-unigram/unigram_freq.csv"
        ngrams = {}

        # Download the dataset
        response = requests.get(NGRAM_DATASET_LINK, timeout=60)
        if response.status_code != 200:
            raise Exception(
                f"Failed to download data, status code: {response.status_code}"
            )

        # Read the CSV content
        csv_content = StringIO(response.text)
        reader = csv.DictReader(csv_content)

        # Build the frequency dictionary
        total_count = 0
        for row in reader:
            word = row["word"]
            count = int(row["count"])
            total_count += count
            ngrams[word] = count

        # Convert raw counts to log probabilities
        return {word: math.log(count / total_count) for word, count in ngrams.items()}
    def _is_form(self, local_pdf_path: str) -> bool:
        # Flag PDFs which are forms, or which cannot be read at all,
        # so they get filtered out (as the log messages below state)
        try:
            pdf_reader = PdfReader(local_pdf_path)
            if pdf_reader.get_form_text_fields():
                return True
        except PyPdfError as pex:
            logger.exception(pex)
            logger.warning("Invalid PDF, filtering out")
            return True
        except DependencyError as dex:
            logger.warning(f"PDF requires external dependency {dex}, filtering out")
            return True
        except Exception as ex:
            logger.exception(ex)
            logger.warning("Internal error reading PDF, filtering out")
            return True

        # TODO: If the distribution of _ characters is very high, it's probably a form
        return False
    def _is_download_spam(self, base_text: str, threshold: float = 0.004) -> bool:
        seo_words = {
            "download",
            "pdf",
            "epub",
            "mobi",
            "free",
            "ebook",
            "file",
            "save",
            "casino",
        }

        # Background log-probabilities of the SEO words (not yet used in the score below)
        seo_word_probs = {word: self.ngram_log_probs[word] for word in seo_words}

        base_text = base_text.strip().lower()
        clean_text = re.sub(r"\W+", " ", base_text)

        word_counts = Counter(clean_text.split())
        total_words = len(clean_text.split())
        if total_words == 0:
            return False

        # Fraction of all words that are SEO/download-spam words
        seo_score = sum(word_counts[word] for word in seo_words if word in word_counts)
        return seo_score / total_words > threshold
    # Returns True if there is something wrong with this PDF and it should be filtered out
    def filter_out_pdf(self, local_pdf_path: str) -> bool:
        # Basic metadata-level filtering
        if self._is_form(local_pdf_path):
            logger.info(f"Filtering out {local_pdf_path} because it is a form")
            return True

        # Read the first five pages of text for language calculation
        pdftotext_result = subprocess.run(
            ["pdftotext", "-f", "1", "-l", "5", local_pdf_path, "-"],
            timeout=60,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        if pdftotext_result.returncode != 0:
            logger.warning(
                f"pdftotext returned {pdftotext_result.returncode} on {local_pdf_path}"
            )
            return False

        base_text = pdftotext_result.stdout.decode("utf-8")

        # Other filter ideas:
        # - Remove patents; they tend to be OCRed, multicolumn, and should come in through a cleaner dataset
        # - Detect things with too many figures
        # - Detect too many pages with no input
        # - Off distribution in terms of words per page, etc.

        if len(base_text) < 100 or len(base_text.split()) < 50:
            logger.warning("Not enough text to analyze; keeping the PDF")
            return False

        language = self.language_detector.detect_language_of(base_text)
        if language != Language.ENGLISH:
            logger.info(
                f"Filtering out {local_pdf_path} because language was {language}"
            )
            return True

        if self._is_download_spam(base_text):
            logger.info(f"Filtering out {local_pdf_path} because of SEO/download spam")
            return True

        return False
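
A minimal usage sketch (the local path is hypothetical; assumes the pdftotext binary is installed, since the filter shells out to it):

from pdelfin.filter import PdfFilter

pdf_filter = PdfFilter()  # downloads the unigram frequency table at construction time

# True means the PDF should be dropped (form, unreadable, non-English, or download spam)
if pdf_filter.filter_out_pdf("/tmp/example.pdf"):  # hypothetical path
    print("filtered out")
else:
    print("keeping")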

pyproject.toml

@@ -19,7 +19,8 @@ authors = [
 ]
 requires-python = ">=3.8"
 dependencies = [
-    # Add your own dependencies here
+    "pypdf",
+    "lingua-language-detector"
 ]
 license = {file = "LICENSE"}
@@ -71,7 +72,7 @@ pdelfin = ["py.typed"]
 version = {attr = "pdelfin.version.VERSION"}
 [tool.black]
-line-length = 100
+line-length = 140
 include = '\.pyi?$'
 exclude = '''
 (
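
Note: lingua-language-detector is the PyPI package that provides the lingua module imported in filter.py, and pypdf provides PdfReader.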

9 binary files not shown.

Deleted file

@@ -1,2 +0,0 @@
-def test_hello():
-    print("Hello, World!")

12
tests/test_filter.py Normal file

@@ -0,0 +1,12 @@
import os
import unittest

from pdelfin.filter import PdfFilter


class PdfFilterTest(unittest.TestCase):
    def setUp(self) -> None:
        self.filter = PdfFilter()

    def testFormLaterPages(self):
        pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "form_on_later_pages.pdf")
        self.assertTrue(self.filter._is_form(pdf_path))
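
The new test can be run with the standard unittest runner, e.g. python -m unittest discover tests. Note that PdfFilter's constructor downloads the unigram frequency table, so the test requires network access.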