diff --git a/olmocr/bench/miners/mine_headers_footers.py b/olmocr/bench/miners/mine_headers_footers.py
index 3c4ec09..601fd41 100644
--- a/olmocr/bench/miners/mine_headers_footers.py
+++ b/olmocr/bench/miners/mine_headers_footers.py
@@ -132,7 +132,7 @@ def detect_headers_footers(pdf_path: str, page_num: int, api_key: str) -> Option
                 parts=[
                     image_part,
                     types.Part.from_text(
-                        text="""Please tell me which text in this image is part of any headers/footers and would therefore be skipped it someone were reading it outloud to another person. Include page numbers and document-level headers and footers, but not inner subsections."""
+                        text="""Please extract and display the complete text from the document without omission. Include all sections and ensure nothing is summarized or abbreviated. I want the entire text of the document at any cost. Do not hallucinate"""
                     ),
                 ],
             ),
diff --git a/olmocr/bench/miners/mine_multi_column.py b/olmocr/bench/miners/mine_multi_column.py
new file mode 100644
index 0000000..6ddc94c
--- /dev/null
+++ b/olmocr/bench/miners/mine_multi_column.py
@@ -0,0 +1,407 @@
+"""
+mine_multi_column.py - Extract text from PDF documents that have multiple columns.
+
+This script:
+1. Takes a folder containing PDF documents as input
+2. Processes each PDF to generate an HTML representation
+3. Renders each PDF to an image
+4. Uses Claude Sonnet to identify text from multiple columns in the rendered image
+5. Creates a test file asserting the order (before/after) in which text should appear
+6. Extracts the page from the PDF and saves it to an output folder
+
+Usage:
+    python mine_multi_column.py --input_dir path/to/pdfs --output_dir path/to/output --api_key your_anthropic_api_key
+"""
+
+import argparse
+import asyncio
+import concurrent.futures
+import json
+import os
+import random
+import re
+import subprocess
+import uuid
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List
+
+import pypdf
+from anthropic import Anthropic
+from bs4 import BeautifulSoup
+from playwright.async_api import async_playwright
+from syntok.segmenter import process
+from tqdm import tqdm
+
+from olmocr.data.renderpdf import (
+    get_png_dimensions_from_base64,
+    render_pdf_to_base64png,
+)
+
+
+def extract_code_block(initial_response):
+    html_blocks = re.findall(r"```html\n(.*?)```", initial_response, re.DOTALL)
+    if html_blocks:
+        return html_blocks[-1].strip()
+    code_blocks = re.findall(r"```\n(.*?)```", initial_response, re.DOTALL)
+    if code_blocks:
+        return code_blocks[-1].strip()
+    html_blocks_no_newline = re.findall(r"```html(.*?)```", initial_response, re.DOTALL)
+    if html_blocks_no_newline:
+        return html_blocks_no_newline[-1].strip()
+    code_blocks_no_newline = re.findall(r"```(.*?)```", initial_response, re.DOTALL)
+    if code_blocks_no_newline:
+        return code_blocks_no_newline[-1].strip()
+    return None
+
+
+def generate_html_from_image(client, image_base64):
+    """Call Claude API to generate HTML from an image using a multi-step prompting strategy."""
+    png_width, png_height = get_png_dimensions_from_base64(image_base64)
+    try:
+        analysis_response = client.messages.create(
+            model="claude-3-7-sonnet-20250219",
+            max_tokens=2000,
+            temperature=0.1,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}},
+                        {
+                            "type": "text",
+                            "text": (
+                                "Analyze this document and provide a detailed assessment of its structure. "
" + "Focus on the layout, headings, footers, and any complex formatting. Please be precise." + ), + }, + ], + } + ], + ) + + analysis_text = "" + for content in analysis_response.content: + if content.type == "text": + analysis_text += content.text + + initial_response = client.messages.create( + model="claude-3-7-sonnet-20250219", + max_tokens=6000, + temperature=0.2, + messages=[ + { + "role": "user", + "content": [ + {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}}, + { + "type": "text", + "text": ( + "Render this document as clean, semantic HTML. Here is the analysis of the document structure:\n\n" + f"{analysis_text}\n\n" + "Requirements:\n" + "1. Use appropriate HTML tags for headings, paragraphs, and lists.\n" + "2. Use
and