Even more test cleanup

2025-11-03 03:25:22 +00:00 · 2025-08-22 18:56:56 +00:00 · 2025-08-22 18:56:56 +00:00 · 0df56e958e
commit 0df56e958e
parent 9831e65161
2 changed files with 20 additions and 4 deletions
--- a/olmocr/bench/miners/download_math.py
+++ b/olmocr/bench/miners/download_math.py
@ -56,7 +56,7 @@ def download_and_extract_source(paper_id, data_dir):


 def download_pdf(paper_id, data_dir):
-    pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
+    pdf_url = f"https://export.arxiv.org/pdf/{paper_id}.pdf"
    print(f"Downloading PDF for {paper_id} from {pdf_url}...")
    response = requests.get(pdf_url)
    if response.status_code != 200:
@ -105,7 +105,7 @@ def main():
            if os.path.exists(tex_path):
                os.remove(tex_path)
                print(f"Removed tex file for {paper_id} because PDF download failed.")
-        time.sleep(1)
+        time.sleep(3)


 if __name__ == "__main__":
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@ -800,12 +800,22 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
                if sentence_str:
                    # Skip HTML table content that might still be in markdown
                    if not sentence_str.startswith('<') and not sentence_str.endswith('>'):
+                        # Skip image placeholders - match any markdown image syntax ![...](...)
+                        if re.search(r'!\[.*?\]\(.*?\)', sentence_str):
+                            continue
+                        
                        # Remove leading # marks (markdown headers)
                        while sentence_str.startswith('#'):
                            sentence_str = sentence_str[1:]
                        sentence_str = sentence_str.strip()
                        
-                        if sentence_str:  # Only add if there's still content after removing #
+                        # Remove leading "- " for unordered lists
+                        if sentence_str.startswith('- '):
+                            sentence_str = sentence_str[2:]
+                        
+                        sentence_str = sentence_str.strip()
+                        
+                        if sentence_str:  # Only add if there's still content after cleaning
                            sentences.append(sentence_str)

    # Add a few random ordering tests
@ -1108,7 +1118,13 @@ def process_pdf(pdf_info, args, client, pdf_filter=None):
                png_width, png_height = get_png_dimensions_from_base64(image_base64)

                # Run the async function in the synchronous context
-                render_success = asyncio.run(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))
+                # Create a new event loop to avoid conflicts
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+                try:
+                    render_success = loop.run_until_complete(render_pdf_with_playwright(html_content, playwright_pdf_path, png_width, png_height))
+                finally:
+                    loop.close()

                if render_success:
                    print(f"Successfully rendered with Playwright: {playwright_pdf_path}")