# scripts/jsonl_to_markdown.py
import json
import os
import sys


# Convert a JSONL file into one Markdown file per record. Each line is parsed
# independently, so a malformed line is reported and skipped without stopping
# the rest of the conversion.
def jsonl_to_markdown(input_file, output_dir, *, include_heading=True):
    """
    Write the 'text' field of every JSONL record to its own Markdown file.

    Each non-blank line of ``input_file`` is parsed as JSON. The value stored
    under its ``"text"`` key (an empty string when the key is missing) is
    written to ``line_<N>.md`` inside ``output_dir``, where ``N`` is the
    1-based line number in the input file. Blank lines are skipped but still
    count towards ``N``. A line that is not valid JSON, is not a JSON object,
    or cannot be written is reported on stdout and the loop moves on.

    Example:
        jsonl_to_markdown("/path/to/test.jsonl", "/path/to/output_markdown")

    Args:
        input_file (str): Path to the input JSONL file.
        output_dir (str): Directory to save the Markdown files. It is created,
            including missing parents, when it does not exist.
        include_heading (bool): When True (the default), each file starts with
            a ``# Extracted Content (Line N)`` heading followed by a blank
            line. When False, only the text itself is written.

    Raises:
        OSError: If ``input_file`` cannot be opened or ``output_dir`` cannot
            be created (for example because the path is an existing file).
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r', encoding='utf-8') as source:
        for line_number, line in enumerate(source, start=1):
            # A blank line (typically a trailing newline at the end of the
            # file) holds no record; skip it instead of reporting bad JSON.
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_number}: {e}")
                continue

            # Only JSON objects have a "text" key to read; arrays, numbers and
            # strings are valid JSON but not usable records.
            if not isinstance(data, dict):
                print(
                    f"Unexpected error on line {line_number}: "
                    f"expected a JSON object, got {type(data).__name__}"
                )
                continue

            text_content = data.get("text", "")
            heading = f"# Extracted Content (Line {line_number})\n\n" if include_heading else ""
            # Formatting through an f-string keeps non-string values
            # (numbers, null) writable in both modes.
            markdown_content = f"{heading}{text_content}"

            output_file = os.path.join(output_dir, f"line_{line_number}.md")
            try:
                with open(output_file, 'w', encoding='utf-8') as md_file:
                    md_file.write(markdown_content)
            except (OSError, UnicodeError) as e:
                # Best effort: one unwritable record must not abort the run.
                print(f"Unexpected error on line {line_number}: {e}")
                continue

            print(f"Extracted and saved line {line_number} to {output_file}")
# Command-line entry point. Exactly two positional arguments are required:
# the input JSONL file and the directory that receives the Markdown files.
if __name__ == "__main__":
    if len(sys.argv) != 3:
        # Name both arguments so the message is usable on its own, and send
        # it to stderr so it does not mix with the per-line progress output.
        print("Usage: python jsonl_to_markdown.py <input_file> <output_dir>", file=sys.stderr)
        sys.exit(1)

    jsonl_to_markdown(sys.argv[1], sys.argv[2])
text in this image is part of any headers/footers and would therefore be skipped it someone were reading it outloud to another person. Include page numbers and document-level headers and footers, but not inner subsections.""" + text="""Please extract and display the complete text from the document without omission. Include all sections and ensure nothing is summarized or abbreviated. I want the entire text of the document at any cost. Do not hallucinate""" ), ], ), diff --git a/olmocr/bench/miners/mine_multi_column.py b/olmocr/bench/miners/mine_multi_column.py new file mode 100644 index 0000000..2c77ff5 --- /dev/null +++ b/olmocr/bench/miners/mine_multi_column.py @@ -0,0 +1,407 @@ +""" +mine_multi_column.py - Extract text from PDF documents that have multiple columns. + +This script: +1. Takes a file containing the path of a folder that contains PDF documents as input +2. Processes each PDF to generate an HTML representation +3. For each PDF, it renders to an image +4. Uses Claude Sonnet to identify text from multiple columns in the rendered image +5. Creates a test file asserting the order (before/after) in which text should appear +6. 
import re

# Fence patterns in priority order: an ```html fence followed by a newline,
# a bare fence followed by a newline, then the same two without requiring the
# newline. The first pattern that matches anything decides the result.
_CODE_FENCE_PATTERNS = (
    re.compile(r"```html\n(.*?)```", re.DOTALL),
    re.compile(r"```\n(.*?)```", re.DOTALL),
    re.compile(r"```html(.*?)```", re.DOTALL),
    re.compile(r"```(.*?)```", re.DOTALL),
)


def extract_code_block(initial_response):
    """Return the contents of the last fenced code block in a response.

    The patterns in ``_CODE_FENCE_PATTERNS`` are tried in order. For the first
    pattern with at least one match, the last match is returned with
    surrounding whitespace stripped.

    Args:
        initial_response: Text that may contain triple-backtick fenced blocks.
            ``None`` and the empty string are accepted.

    Returns:
        The stripped block contents, or ``None`` when the input is empty or
        contains no fenced block.
    """
    # Guard against a missing response; re.findall would raise on None.
    if not initial_response:
        return None

    for pattern in _CODE_FENCE_PATTERNS:
        matches = pattern.findall(initial_response)
        if matches:
            # Several blocks may be present; the last one is used.
            return matches[-1].strip()
    return None
" + "Focus on the layout, headings, footers, and any complex formatting. Please be precise." + ), + }, + ], + } + ], + ) + + analysis_text = "" + for content in analysis_response.content: + if content.type == "text": + analysis_text += content.text + + initial_response = client.messages.create( + model="claude-3-7-sonnet-20250219", + max_tokens=6000, + temperature=0.2, + messages=[ + { + "role": "user", + "content": [ + {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_base64}}, + { + "type": "text", + "text": ( + "Render this document as clean, semantic HTML. Here is the analysis of the document structure:\n\n" + f"{analysis_text}\n\n" + "Requirements:\n" + "1. Use appropriate HTML tags for headings, paragraphs, and lists.\n" + "2. Use
and