From d4d87f7c65a12c80eae5cc60573901cbbfdb0f99 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Thu, 3 Apr 2025 20:27:01 +0000
Subject: [PATCH] Force flag for review app, tests fixed for difference
 comparison in tables

---
 olmocr/bench/review_app.py | 43 +++++++++++++++++++-------------------
 olmocr/bench/test_tests.py | 18 ++++++++++++++++
 olmocr/bench/tests.py      | 21 ++++++++++---------
 3 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/olmocr/bench/review_app.py b/olmocr/bench/review_app.py
index 9c00fcb..e07e9c6 100644
--- a/olmocr/bench/review_app.py
+++ b/olmocr/bench/review_app.py
@@ -18,7 +18,7 @@ DATASET_FILE = None
 CURRENT_PDF = None
 PDF_TESTS = {}
 ALL_PDFS = []
-
+FORCE = False  # Set from --force: never show the "All done" page; fall back to / wrap around to ALL_PDFS[0]
 
 def find_next_unchecked_pdf() -> Optional[str]:
     """Find the next PDF with at least one unchecked test."""
@@ -31,7 +31,6 @@ def find_next_unchecked_pdf() -> Optional[str]:
                 return pdf_name
     return None
 
-
 def calculate_stats() -> dict:
     """Calculate statistics for all tests in the dataset."""
     global PDF_TESTS
@@ -59,7 +58,6 @@ def calculate_stats() -> dict:
 
     return {"total": total_tests, "null": null_status, "verified": verified_status, "rejected": rejected_status, "completion": completion}
 
-
 def save_dataset(jsonl_file: str) -> None:
     """Save the tests to a JSONL file, using temp file for atomic write."""
     global PDF_TESTS
@@ -77,26 +75,27 @@ def save_dataset(jsonl_file: str) -> None:
     # Atomic replace
     shutil.move(temp_file.name, jsonl_file)
 
-
 @app.route("/pdf/<path:pdf_name>")
 def serve_pdf(pdf_name):
     """Serve the PDF file directly."""
     pdf_path = os.path.join(DATASET_DIR, "pdfs", pdf_name)
     return send_file(pdf_path, mimetype="application/pdf")
 
-
 @app.route("/")
 def index():
     """Main page displaying the current PDF and its tests."""
-    global CURRENT_PDF, PDF_TESTS, DATASET_DIR
+    global CURRENT_PDF, PDF_TESTS, DATASET_DIR, ALL_PDFS, FORCE
 
     # If no current PDF is set, find the next one with unchecked tests
     if CURRENT_PDF is None:
         CURRENT_PDF = find_next_unchecked_pdf()
 
-    # If still no PDF, all tests have been checked
+    # If still no PDF, either show the "All done" page or force display the first PDF
     if CURRENT_PDF is None:
-        return render_template("all_done.html")
+        if FORCE and ALL_PDFS:
+            CURRENT_PDF = ALL_PDFS[0]
+        else:
+            return render_template("all_done.html")
 
     # Get the tests for the current PDF
     current_tests = PDF_TESTS.get(CURRENT_PDF, [])
@@ -117,7 +116,6 @@ def index():
         stats=stats,
     )
 
-
 @app.route("/update_test", methods=["POST"])
 def update_test():
     """API endpoint to update a test."""
@@ -140,7 +138,6 @@ def update_test():
 
     return jsonify({"status": "success"})
 
-
 @app.route("/reject_all", methods=["POST"])
 def reject_all():
     """API endpoint to reject all tests for a PDF."""
@@ -161,24 +158,29 @@ def reject_all():
 
     return jsonify({"status": "error", "message": "PDF not found"})
 
-
 @app.route("/next_pdf", methods=["POST"])
 def next_pdf():
     """Move to the next PDF in the list."""
-    global CURRENT_PDF, ALL_PDFS
+    global CURRENT_PDF, ALL_PDFS, FORCE
 
     if CURRENT_PDF in ALL_PDFS:
         current_index = ALL_PDFS.index(CURRENT_PDF)
         if current_index < len(ALL_PDFS) - 1:
             CURRENT_PDF = ALL_PDFS[current_index + 1]
         else:
-            CURRENT_PDF = find_next_unchecked_pdf()
+            # If in force mode, cycle back to the beginning instead of checking for an unchecked PDF
+            if FORCE and ALL_PDFS:
+                CURRENT_PDF = ALL_PDFS[0]
+            else:
+                CURRENT_PDF = find_next_unchecked_pdf()
     else:
-        CURRENT_PDF = find_next_unchecked_pdf()
+        if FORCE and ALL_PDFS:
+            CURRENT_PDF = ALL_PDFS[0]
+        else:
+            CURRENT_PDF = find_next_unchecked_pdf()
 
     return redirect(url_for("index"))
 
-
 @app.route("/prev_pdf", methods=["POST"])
 def prev_pdf():
     """Move to the previous PDF in the list."""
@@ -191,7 +193,6 @@ def prev_pdf():
 
     return redirect(url_for("index"))
 
-
 @app.route("/goto_pdf/<int:index>", methods=["POST"])
 def goto_pdf(index):
     """Go to a specific PDF by index."""
@@ -202,7 +203,6 @@ def goto_pdf(index):
 
     return redirect(url_for("index"))
 
-
 def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
     """Load tests from the dataset file and organize them by PDF."""
     if not os.path.exists(dataset_file):
@@ -228,24 +228,24 @@ def load_dataset(dataset_file: str) -> Tuple[Dict[str, List[Dict]], List[str]]:
 
     return pdf_tests, all_pdfs
 
-
 def create_templates_directory():
     """Create templates directory for Flask if it doesn't exist."""
     templates_dir = os.path.join(os.path.dirname(__file__), "templates")
     os.makedirs(templates_dir, exist_ok=True)
 
-
 def main():
     """Main entry point with command-line arguments."""
-    global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF
+    global DATASET_DIR, DATASET_FILE, PDF_TESTS, ALL_PDFS, CURRENT_PDF, FORCE
 
     parser = argparse.ArgumentParser(description="Interactive Test Review App")
     parser.add_argument("dataset_file", help="Path to the dataset jsonl file")
     parser.add_argument("--port", type=int, default=5000, help="Port for the Flask app")
     parser.add_argument("--host", default="127.0.0.1", help="Host for the Flask app")
     parser.add_argument("--debug", action="store_true", help="Run Flask in debug mode")
-
+    parser.add_argument("--force", action="store_true", help="Force show each file one by one and never do the 'All done' page")
+    
     args = parser.parse_args()
+    FORCE = args.force  # Set the global FORCE flag
 
     # Validate dataset directory
     if not os.path.exists(args.dataset_file):
@@ -280,6 +280,5 @@ def main():
 
     return 0
 
-
 if __name__ == "__main__":
     sys.exit(main())
diff --git a/olmocr/bench/test_tests.py b/olmocr/bench/test_tests.py
index d5f89e3..5b7da6d 100644
--- a/olmocr/bench/test_tests.py
+++ b/olmocr/bench/test_tests.py
@@ -600,6 +600,24 @@ Some text before the table.
         result, explanation = test.run(table)
         self.assertTrue(result, explanation)
 
+    def test_diffs(self):
+        table = """| CATEGORY     | POINTS EARNED |
+|------------------------------|------------------|
+| Sustainable Sites            | 9                |
+| Water Efficiency             | 3                |
+| Energy & Atmosphere          | 12               |
+| Materials & Resources        | 6                |
+| Indoor Environmental Quality | 11               |
+| Innovation & Design Process  | 5                |
+| TOTAL                        | 46               |"""
+        test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="9", left="Sustl Sie", max_diffs=2)
+        result, explanation = test.run(table)
+        self.assertFalse(result, explanation)  # "Sustl Sie" drops 8 of the 17 characters of "Sustainable Sites": far beyond max_diffs=2, must not match
+
+        test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="9", left="Sustainable Site", max_diffs=2)
+        result, explanation = test.run(table)
+        self.assertTrue(result, explanation)  # "Sustainable Site" is one character short of "Sustainable Sites": within max_diffs=2, must match
+
     def test_markdown_marker2(self):
         table = """| Concentration
 level | [CO]      | [SO2] | [NOx]    |
diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py
index e06f7d1..aea2774 100644
--- a/olmocr/bench/tests.py
+++ b/olmocr/bench/tests.py
@@ -658,6 +658,7 @@ class TableTest(BasePDFTest):
 
         # Threshold for fuzzy matching derived from max_diffs
         threshold = 1.0 - (self.max_diffs / (len(self.cell) if len(self.cell) > 0 else 1))
+        threshold = max(0.5, threshold)
 
         # Parse tables based on content_type
         md_tables = parse_markdown_tables(content)
@@ -700,7 +701,7 @@ class TableTest(BasePDFTest):
                 if self.up and row_idx > 0:
                     up_cell = normalize_text(table_array[row_idx - 1, col_idx])
                     up_similarity = fuzz.ratio(self.up, up_cell) / 100.0
-                    if up_similarity < threshold:
+                    if up_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.up) if len(self.up) > 0 else 1))):
                         all_relationships_satisfied = False
                         current_failed_reasons.append(f"Cell above '{up_cell}' doesn't match expected '{self.up}' (similarity: {up_similarity:.2f})")
 
@@ -708,7 +709,7 @@ class TableTest(BasePDFTest):
                 if self.down and row_idx < table_array.shape[0] - 1:
                     down_cell = normalize_text(table_array[row_idx + 1, col_idx])
                     down_similarity = fuzz.ratio(self.down, down_cell) / 100.0
-                    if down_similarity < threshold:
+                    if down_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.down) if len(self.down) > 0 else 1))):
                         all_relationships_satisfied = False
                         current_failed_reasons.append(f"Cell below '{down_cell}' doesn't match expected '{self.down}' (similarity: {down_similarity:.2f})")
 
@@ -716,7 +717,7 @@ class TableTest(BasePDFTest):
                 if self.left and col_idx > 0:
                     left_cell = normalize_text(table_array[row_idx, col_idx - 1])
                     left_similarity = fuzz.ratio(self.left, left_cell) / 100.0
-                    if left_similarity < threshold:
+                    if left_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.left) if len(self.left) > 0 else 1))):
                         all_relationships_satisfied = False
                         current_failed_reasons.append(
                             f"Cell to the left '{left_cell}' doesn't match expected '{self.left}' (similarity: {left_similarity:.2f})"
@@ -726,7 +727,7 @@ class TableTest(BasePDFTest):
                 if self.right and col_idx < table_array.shape[1] - 1:
                     right_cell = normalize_text(table_array[row_idx, col_idx + 1])
                     right_similarity = fuzz.ratio(self.right, right_cell) / 100.0
-                    if right_similarity < threshold:
+                    if right_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.right) if len(self.right) > 0 else 1))):
                         all_relationships_satisfied = False
                         current_failed_reasons.append(
                             f"Cell to the right '{right_cell}' doesn't match expected '{self.right}' (similarity: {right_similarity:.2f})"
@@ -747,7 +748,7 @@ class TableTest(BasePDFTest):
                             if similarity > best_similarity:
                                 best_similarity = similarity
                                 best_match = header_text
-                                if best_similarity >= threshold:
+                                if best_similarity >= max(0.5, 1.0 - (self.max_diffs / (len(self.top_heading) if len(self.top_heading) > 0 else 1))):
                                     top_heading_found = True
                                     break
 
@@ -760,7 +761,7 @@ class TableTest(BasePDFTest):
                                 if similarity > best_similarity:
                                     best_similarity = similarity
                                     best_match = header_text
-                                    if best_similarity >= threshold:
+                                    if best_similarity >= max(0.5, 1.0 - (self.max_diffs / (len(self.top_heading) if len(self.top_heading) > 0 else 1))):
                                         top_heading_found = True
                                         break
 
@@ -777,7 +778,7 @@ class TableTest(BasePDFTest):
                     if not best_match:
                         all_relationships_satisfied = False
                         current_failed_reasons.append(f"No top heading found for cell at ({row_idx}, {col_idx})")
-                    elif best_similarity < threshold:
+                    elif best_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.top_heading) if len(self.top_heading) > 0 else 1))):
                         all_relationships_satisfied = False
                         current_failed_reasons.append(
                             f"Top heading '{best_match}' doesn't match expected '{self.top_heading}' (similarity: {best_similarity:.2f})"
@@ -798,7 +799,7 @@ class TableTest(BasePDFTest):
                             if similarity > best_similarity:
                                 best_similarity = similarity
                                 best_match = header_text
-                                if best_similarity >= threshold:
+                                if best_similarity >= max(0.5, 1.0 - (self.max_diffs / (len(self.left_heading) if len(self.left_heading) > 0 else 1))):
                                     left_heading_found = True
                                     break
 
@@ -811,7 +812,7 @@ class TableTest(BasePDFTest):
                                 if similarity > best_similarity:
                                     best_similarity = similarity
                                     best_match = header_text
-                                    if best_similarity >= threshold:
+                                    if best_similarity >= max(0.5, 1.0 - (self.max_diffs / (len(self.left_heading) if len(self.left_heading) > 0 else 1))):
                                         left_heading_found = True
                                         break
 
@@ -828,7 +829,7 @@ class TableTest(BasePDFTest):
                     if not best_match:
                         all_relationships_satisfied = False
                         current_failed_reasons.append(f"No left heading found for cell at ({row_idx}, {col_idx})")
-                    elif best_similarity < threshold:
+                    elif best_similarity < max(0.5, 1.0 - (self.max_diffs / (len(self.left_heading) if len(self.left_heading) > 0 else 1))):
                         all_relationships_satisfied = False
                         current_failed_reasons.append(
                             f"Left heading '{best_match}' doesn't match expected '{self.left_heading}' (similarity: {best_similarity:.2f})"