More tests

2025-12-13 16:21:16 +00:00 · 2025-04-02 18:39:51 +00:00 · 2025-04-02 18:39:51 +00:00 · 97376493fd
commit 97376493fd
parent 748ab95751
2 changed files with 73 additions and 55 deletions
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@ -257,7 +257,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L
                text = tag.get_text().strip()
                if text:
                    text_elements.append(text)
-                    
+
        # If no elements found, use the parent's text as a fallback
        if not text_elements:
            parent_text = parent_element.get_text().strip()
@ -274,7 +274,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L
                        "id": f"{pdf_id}_{element_type}_{uuid.uuid4().hex[:8]}",
                        "type": TestType.ABSENT.value,
                        "text": text,
-                        "max_diffs": 5,
+                        "max_diffs": 0,
                    }
                )

@ -297,7 +297,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L
                    "id": f"{pdf_id}_page_number_{uuid.uuid4().hex[:8]}",
                    "type": TestType.ABSENT.value,
                    "text": page_number_text,
-                    "max_diffs": 5,
+                    "max_diffs": 0,
                }
            )

@ -392,68 +392,56 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int) -> L
    main_soup = BeautifulSoup(str(soup), "html.parser")

    # Remove headers, footers, and tables from the main_soup
-    for element in main_soup.find_all(["header", "footer", "table"]):
+    for element in main_soup.find_all(["header", "footer", "table", "head"]):
        element.extract()

    # Get all paragraphs and headings in the main content
    paragraphs = main_soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"])

-    # Sample a few paragraphs to use for presence tests
-    if paragraphs:
-        sampled_paragraphs = random.sample(paragraphs, min(5, len(paragraphs)))
+    full_text = main_soup.get_text().strip()

-        for paragraph in sampled_paragraphs:
-            text = paragraph.get_text().strip()
-            # Only create tests for paragraphs with sufficient content
-            if text and len(text) > 20:
-                tests.append(
-                    {
-                        "pdf": pdf_filename,
-                        "page": page_num,
-                        "id": f"{pdf_id}_text_{uuid.uuid4().hex[:8]}",
-                        "type": TestType.PRESENT.value,
-                        "text": text[:200],  # Limit to 200 chars to keep tests manageable
-                        "max_diffs": 10,
-                    }
-                )
+    sentences = []
+    for paragraph in process(full_text):
+        for sentence in paragraph:
+            # Rebuild the sentence text by joining each token's leading spacing and value
+            sentence_str = ""
+            for token in sentence:
+                sentence_str += token.spacing + token.value

-    # Generate some TextOrderTests for content that should appear in a specific order
-    if len(paragraphs) >= 2:
-        # Extract all text from the main content
-        all_text = " ".join([p.get_text().strip() for p in paragraphs])
+            sentence_str = sentence_str.strip()

-        # Use syntok to segment the text into sentences
-        sentences = []
-        for paragraph in process(all_text):
-            for sentence in paragraph:
-                # Convert token sequence to string and clean it
-                sentence_text = " ".join([token.value for token in sentence]).strip()
-                if sentence_text and len(sentence_text) > 10 and len(sentence_text) < 100:
-                    sentences.append(sentence_text)
+            if sentence_str:
+                sentences.append(sentence_str)

-        # Create TextOrderTests from pairs of sentences that are at least 3 sentences apart
-        # to ensure they're from different parts of the document
-        if len(sentences) >= 5:
-            num_tests = min(3, len(sentences) // 5)
-            for _ in range(num_tests):
-                # Get two random indices with sufficient distance between them
-                i = random.randint(0, len(sentences) - 4)
-                j = random.randint(i + 3, min(i + 10, len(sentences) - 1))
+    # Add a few random ordering tests
+    all_indexes = list(range(len(sentences)))
+    
+    # Draw 10 random index pairs from all_indexes, ordered so idx1 < idx2; the set drops duplicate pairs
+    random_pairs = set()
+    for _ in range(10):
+        idx1, idx2 = random.sample(all_indexes, 2)
+        if idx1 > idx2:
+            idx1, idx2 = idx2, idx1
+        random_pairs.add((idx1, idx2))

-                first_sentence = sentences[i]
-                second_sentence = sentences[j]
+    for i, j in random_pairs:
+        first_sentence = sentences[i]
+        second_sentence = sentences[j]

-                tests.append(
-                    {
-                        "pdf": pdf_filename,
-                        "page": page_num,
-                        "id": f"{pdf_id}_order_{uuid.uuid4().hex[:8]}",
-                        "type": TestType.ORDER.value,
-                        "before": first_sentence,
-                        "after": second_sentence,
-                        "max_diffs": 10,
-                    }
-                )
+        if len(first_sentence) < 10 or len(second_sentence) < 10:
+            continue
+
+        tests.append(
+            {
+                "pdf": pdf_filename,
+                "page": page_num,
+                "id": f"{pdf_id}_order_{uuid.uuid4().hex[:8]}",
+                "type": TestType.ORDER.value,
+                "before": first_sentence,
+                "after": second_sentence,
+                "max_diffs": round(max(len(first_sentence), len(second_sentence)) * 0.05),
+            }
+        )

    return tests

--- a/olmocr/bench/synth/test_mine.py
+++ b/olmocr/bench/synth/test_mine.py
@ -41,4 +41,34 @@ class TestMineTests(unittest.TestCase):
 """
        tests = generate_tests_from_html(html_content, "0", 1)

-        self.assertEqual(len([test for test in tests if test["type"]=="absent"]), 2)
+        self.assertEqual(len([test for test in tests if test["type"]=="absent"]), 2)
+
+    def test_text_basic(self):
+        html_content = """
+
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Bone Morphology Description</title>
+</head>
+<body>
+    <main>
+        <p>The posterior end exhibits a curved process to articulate with the angular. Aside from the process, the rest of the posterior end has slight curvatures for articulation, but is mostly uniform. Ventral border of the bone is mostly straight, with slight curvature (FIG. 20).</p>
+        
+        <p><span class="section-heading">Lateral</span>- A spine runs from the anterior-most tip, reduces in height ~3/4 way down toward posterior, and terminates at the center of the posterior notch. A fossa is present on the dorsal side of the spine. The posterior end exhibits more relief than in medial view, with the medial side of the posterior process extending past the posterior notch.</p>
+        
+        <p><span class="section-heading">Ontogeny</span>- Anterior tip is sharply pointed in AR12 and AR1 with AR2 being rounded, though this could be due to breakage. Anterior dorsal margin is straight and flat in AR12; AR2 shows little curvature and AR1 shows the most curvature; curving outward dorsally. Dorsal incisure is anteroposteriorly oriented in AR12, in AR2 there is some ventral curvature, and in AR1 there is a posteroventral curvature. Both AR1 and AR3 are curved on the ventral margin while AR12 is mostly straight. Posterior end of AR1 exhibits four undulations, ventral process is not yet extended. A fossa is present dorsal to the ventral process, not seen on AR12 or AR2. In medial view the lateral ridge is visible posteriorly in AR1 and AR2l the ridge does not fully extend anteriorly. In lateral view of the posterior the ventral process is present on AR2, but not fully extended posteriorly. Tip of the anterodorsal process is sharply pointed in AR1 and AR2, rounded in AR12. A short ridge is present on the dorsal edge of the dorsal process of AR1. The short ridge on the posterodorsal process of AR2 is slightly more ventral than in AR1. On AR12 the ridge is long and positioned most ventral. The lateral ridge is closest to the ventral margin in AR1. In AR2 the ridge is positioned more dorsally and in AR12 the ridge terminates and the anterior tip. The section of bone ventral to the lateral ridge appears to thin with age. The posterior notch on AR12 is curved anteriorly and the medial side of the notch extends posteriorly</p>
+    </main>
+    
+    <footer>
+        <p>46</p>
+    </footer>
+</body>
+</html>"""
+
+        tests = generate_tests_from_html(html_content, "0", 1)
+
+        for test in tests:
+            print(test)