From 4d837b7db2fc3eda742ba8b64b1323d0771df7b2 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Tue, 19 Aug 2025 20:01:42 +0000
Subject: [PATCH] More filter rules

---
Review notes (free-form text in this position is ignored by `git am`):

* This is a hand-edited revision of the submitted patch, so the commit id on
  the first line and the blob ids on the `index` line still describe the
  original submission, not this revision.
* The patch was restored from a whitespace-collapsed copy. Leading whitespace
  on context lines is a best-effort reconstruction; if a hunk is rejected,
  retry with `git apply --ignore-whitespace`.
* The existing table filters in __call__ are left enabled and the new
  equation-render check is enabled too, matching the class docstring. The
  submitted version commented all four of them out.
* Unexpected validation errors are reported through `logger` instead of
  `print`, duplicate entries were dropped from `math_symbols`, and a redundant
  whitespace re-check on already-stripped equations was removed.

 olmocr/train/dataloader.py | 107 +++++++++++++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)

diff --git a/olmocr/train/dataloader.py b/olmocr/train/dataloader.py
index e9612d9..33a23a3 100644
--- a/olmocr/train/dataloader.py
+++ b/olmocr/train/dataloader.py
@@ -528,6 +528,8 @@ class DatasetTextRuleFilter(PipelineStep):
     - Contain markdown tables
     - Contain malformed HTML tables
     - Contain <br> tags within HTML table cells
+    - Contain math equations that fail to render
+    - Contain mathematical symbols (e.g. ∈, ∉, ⊂, ⊃, ⊆, ⊇, ∅, ∪, ∩, ∀, ∃, ¬) outside of table cells
     """
 
     def _contains_markdown_table(self, text: str) -> bool:
@@ -552,6 +554,103 @@ class DatasetTextRuleFilter(PipelineStep):
                             return True
         return False
 
+    def _contains_math_symbols(self, text: str) -> bool:
+        """Check if text contains specific mathematical symbols outside of table cells.
+
+        HTML ``<table>`` blocks are removed before the check, so symbols that
+        only occur inside tables do not cause a match.
+
+        Args:
+            text: Sample text to scan.
+
+        Returns:
+            True if text contains any of the specified math symbols outside tables
+            False otherwise
+        """
+        import re
+
+        # List of mathematical symbols to check for
+        math_symbols = [
+            # Set theory and logic
+            '∈', '∉', '⊂', '⊃', '⊆', '⊇', '∅', '∪', '∩', '∀', '∃', '¬',
+            # Common mathematical operators
+            '⊕', '⊗', '⊙',
+            # Calculus and analysis
+            '∂', '∇', '∆', '∫', '∬', '∭', '∮', '∏', '∑', '√', '∛', '∜',
+            # Relations
+            '⊥',
+            # Other common math symbols
+            '∠', '∡', '⊤', '⊢', '⊣', '∴', '∵', '∶', '∷', '∝', '≅', '≆', '≇', '≊', '≋',
+            # Remaining circled operators (⊕, ⊗, ⊙ are already listed above)
+            '⊖', '⊘', '⊚', '⊛', '⊜', '⊝',
+        ]
+
+        # Remove HTML tables so that only text outside of tables is checked
+        table_pattern = re.compile(r'<table[^>]*>.*?</table>', re.IGNORECASE | re.DOTALL)
+        text_without_tables = table_pattern.sub('', text)
+
+        # Now check if any of these symbols appear in the text without tables
+        return any(symbol in text_without_tables for symbol in math_symbols)
+
+    def _validate_math_equations(self, text: str) -> bool:
+        """Check if all math equations in the text can render without errors.
+
+        Args:
+            text: Sample text to scan for delimited math equations.
+
+        Returns:
+            True if all equations render successfully or no equations exist
+            False if any equation fails to render
+        """
+        import re
+
+        # Patterns to find math equations (same as in MathTest)
+        patterns = [
+            r"\$\$(.+?)\$\$",  # $$...$$
+            r"\\\((.+?)\\\)",  # \(...\)
+            r"\\\[(.+?)\\\]",  # \[...\]
+        ]
+
+        equations = []
+        for pattern in patterns:
+            # Find all matches for the current pattern
+            matches = re.findall(pattern, text, re.DOTALL)
+            equations.extend([eq.strip() for eq in matches])
+
+        # If no equations found, that's fine
+        if not equations:
+            return True
+
+        # Try to render each equation
+        try:
+            from olmocr.bench.katex.render import render_equation
+
+            for equation in equations:
+                # Skip empty equations (they were already stripped above)
+                if not equation:
+                    continue
+
+                # Try to render the equation
+                rendered = render_equation(equation)
+
+                # Check if there was an error
+                if rendered is None or (hasattr(rendered, 'error') and rendered.error):
+                    # Equation failed to render
+                    logger.warning(f"Could not render equation '{repr(equation)}', skipping sample")
+                    return False
+
+            # All equations rendered successfully
+            return True
+
+        except ImportError:
+            # If we can't import the render module, skip this check
+            # This allows the filter to work even without the rendering dependencies
+            return True
+        except Exception as e:
+            # If any unexpected error occurs during validation, be conservative and filter out
+            logger.warning(f"Error validating math equations: {e}")
+            return False
+
     def _contains_br_in_table_cells(self, text: str) -> bool:
         """Check if text contains <br> tags within HTML table cells.
 
@@ -694,5 +793,13 @@ class DatasetTextRuleFilter(PipelineStep):
         if self._contains_br_in_table_cells(text):
             return None  # Filter out samples with <br> tags in table cells
 
+        # Check if all math equations can render without errors
+        if not self._validate_math_equations(text):
+            return None  # Filter out samples with invalid math equations
+
+        # Check for mathematical symbols
+        if self._contains_math_symbols(text):
+            return None  # Filter out samples with mathematical symbols
+
         return sample
 