diff --git a/olmocr/bench/benchmark.py b/olmocr/bench/benchmark.py
index 126b2f7..eb91a66 100644
--- a/olmocr/bench/benchmark.py
+++ b/olmocr/bench/benchmark.py
@@ -199,7 +199,7 @@ def main():
     parser.add_argument("--test_report", type=str, default=None, help="Generate an HTML report of test results. Provide a filename (e.g., results.html).")
     args = parser.parse_args()
 
-    input_folder = args.dir
+    input_folder = args.dir if os.path.isdir(args.dir) else os.path.dirname(args.dir)
     n_bootstrap = args.bootstrap_samples
     ci_level = args.confidence_level
     pdf_folder = os.path.join(input_folder, "pdfs")
@@ -216,7 +216,11 @@ def main():
     pdf_basenames = [os.path.relpath(p, pdf_folder) for p in all_pdf_files]
 
-    jsonl_files = glob.glob(os.path.join(input_folder, "*.jsonl"))
+    if os.path.isfile(args.dir):
+        jsonl_files = [args.dir]
+    else:
+        jsonl_files = glob.glob(os.path.join(input_folder, "*.jsonl"))
+
    if not jsonl_files:
        print(f"Error: No .jsonl files found in {input_folder}.", file=sys.stderr)
        sys.exit(1)
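For reviewers: the two hunks above work together, so that pointing `--dir` at a single `.jsonl` file makes that file the whole candidate list, while `input_folder` falls back to the file's parent so the sibling `pdfs/` folder still resolves. A minimal sketch of the dispatch, with a hypothetical layout, for sanity-checking the edge cases (this is not the real API, just the logic the hunks introduce):

```python
import glob
import os

def resolve_inputs(dir_arg: str) -> tuple[str, list[str]]:
    """Sketch mirroring the benchmark.py dispatch above.

    A directory yields every *.jsonl inside it; an existing file path yields
    just that file, with its parent used as the root for the pdfs/ folder.
    """
    input_folder = dir_arg if os.path.isdir(dir_arg) else os.path.dirname(dir_arg)
    if os.path.isfile(dir_arg):
        jsonl_files = [dir_arg]
    else:
        jsonl_files = glob.glob(os.path.join(input_folder, "*.jsonl"))
    return input_folder, jsonl_files

# Hypothetical layout: bench_data/pdfs/*.pdf next to bench_data/tests.jsonl.
# If the file exists, this returns ("bench_data", ["bench_data/tests.jsonl"]).
print(resolve_inputs("bench_data/tests.jsonl"))
```

One edge case worth noting: `os.path.dirname("tests.jsonl")` is the empty string, so a bare relative filename resolves `pdfs/` against the current working directory.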
diff --git a/olmocr/bench/runners/run_chatgpt.py b/olmocr/bench/runners/run_chatgpt.py
index edb743e..1420616 100644
--- a/olmocr/bench/runners/run_chatgpt.py
+++ b/olmocr/bench/runners/run_chatgpt.py
@@ -1,18 +1,29 @@
 import json
 import os
+from typing import Literal
 
 from openai import OpenAI
 
+from olmocr.bench.prompts import build_basic_prompt
 from olmocr.data.renderpdf import render_pdf_to_base64png
 from olmocr.prompts.anchor import get_anchor_text
 from olmocr.prompts.prompts import (
     PageResponse,
+    build_finetuning_prompt,
     build_openai_silver_data_prompt,
     openai_response_format_schema,
 )
 
 
-def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-06", temperature: float = 0.1) -> str:
+def run_chatgpt(
+    pdf_path: str,
+    page_num: int = 1,
+    model: str = "gpt-4o-2024-08-06",
+    temperature: float = 0.1,
+    target_longest_image_dim: int = 2048,
+    prompt_template: Literal["full", "basic", "finetune"] = "finetune",
+    response_template: Literal["plain", "json"] = "json",
+) -> str:
     """
     Convert page of a PDF file to markdown using the commercial openAI APIs.
@@ -25,7 +36,7 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
         str: The OCR result in markdown format.
     """
     # Convert the first page of the PDF to a base64-encoded PNG image.
-    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=2048)
+    image_base64 = render_pdf_to_base64png(pdf_path, page_num=page_num, target_longest_image_dim=target_longest_image_dim)
     anchor_text = get_anchor_text(pdf_path, page_num, pdf_engine="pdfreport")
 
     if not os.getenv("OPENAI_API_KEY"):
@@ -33,20 +44,29 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
 
     client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
+    if prompt_template == "full":
+        prompt = build_openai_silver_data_prompt(anchor_text)
+    elif prompt_template == "finetune":
+        prompt = build_finetuning_prompt(anchor_text)
+    elif prompt_template == "basic":
+        prompt = build_basic_prompt()
+    else:
+        raise ValueError("Unknown prompt template")
+
     response = client.chat.completions.create(
         model=model,
         messages=[
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": build_openai_silver_data_prompt(anchor_text)},
+                    {"type": "text", "text": prompt},
                     {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
                 ],
             }
         ],
         temperature=temperature,
         max_tokens=3000,
-        response_format=openai_response_format_schema(),
+        response_format=openai_response_format_schema() if response_template == "json" else None,
     )
 
     raw_response = response.choices[0].message.content
@@ -55,7 +75,10 @@ def run_chatgpt(pdf_path: str, page_num: int = 1, model: str = "gpt-4o-2024-08-0
     assert response.choices[0].message.refusal is None
     assert response.choices[0].finish_reason == "stop"
 
-    data = json.loads(raw_response)
-    data = PageResponse(**data)
+    if response_template == "json":
+        data = json.loads(raw_response)
+        data = PageResponse(**data)
 
-    return data.natural_text
+        return data.natural_text
+    else:
+        return raw_response
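Also worth flagging in review: the default prompt moves from the previously hardcoded silver-data prompt to the fine-tuning prompt, and `"plain"` mode returns the raw completion without `PageResponse` parsing. A minimal sketch of calling the reworked runner; the PDF path is hypothetical and `OPENAI_API_KEY` must be set:

```python
from olmocr.bench.runners.run_chatgpt import run_chatgpt

# New defaults: fine-tuning prompt, JSON-schema response parsed via PageResponse.
text = run_chatgpt("paper.pdf", page_num=1)

# Anchor-free "basic" prompt, raw (unparsed) model output, smaller render.
raw = run_chatgpt(
    "paper.pdf",
    page_num=1,
    prompt_template="basic",
    response_template="plain",
    target_longest_image_dim=1024,  # render smaller to save image tokens
)
print(text)
print(raw)
```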
diff --git a/olmocr/bench/templates/all_done_latex.html b/olmocr/bench/templates/all_done_latex.html
new file mode 100644
index 0000000..5784f94
--- /dev/null
+++ b/olmocr/bench/templates/all_done_latex.html
@@ -0,0 +1,54 @@
[54-line HTML template; the markup was stripped in extraction. Recoverable content: page title "All Done!", a heading reading "All Done! 🎉", and the message "You have reviewed all equations in the dataset."]
\ No newline at end of file
diff --git a/olmocr/bench/templates/review_latex.html b/olmocr/bench/templates/review_latex.html
index 079f391..c0be69b 100644
--- a/olmocr/bench/templates/review_latex.html
+++ b/olmocr/bench/templates/review_latex.html
@@ -1,8 +1,9 @@
[markup stripped in extraction: the hunk adds one line at the top of the file and touches the line carrying the "Equation Verification" page title]
@@ -30,15 +31,15 @@
         overflow: hidden;
     }
     .pdf-viewer {
-        flex: 2;
+        flex: 2; /* Increased from 1 to 2 to make PDF larger */
         border-right: 1px solid #ddd;
         overflow: hidden;
         position: relative;
     }
-
+    /* Updated PDF container size */
     #pdf-container {
-        width: 200%;
-        height: 200%;
+        width: 200%; /* New fixed width */
+        height: 200%; /* New fixed height */
         overflow: auto;
     }
     #zoom-controls {
@@ -74,7 +75,7 @@
     .test-item.rejected {
         background-color: #f8d7da;
     }
-
+    /* The equation-display now stores the raw LaTeX in a data attribute */
     .equation-display {
         padding: 10px;
         margin: 5px 0;
@@ -82,7 +83,7 @@
         border-radius: 4px;
         background-color: #f9f9f9;
         overflow-x: auto;
-        font-size: 1.2em;
+        font-size: 1.2em; /* Larger font for equations */
     }
     .button-group {
         display: flex;
@@ -126,7 +127,7 @@
         background-color: #007bff;
         width: 0%;
     }
-
+    /* Make MathJax equations more visible */
     .MathJax {
         font-size: 120% !important;
     }
@@ -168,7 +169,7 @@
[markup stripped in extraction: inside the "Equations ({{ tests|length }})" panel, one line of the element opened by {% for test in tests %} is changed]
@@ -176,6 +177,7 @@
[markup stripped in extraction: one line is added next to the display that renders {{ test.text|safe }}]
@@ -184,19 +186,23 @@
[markup stripped in extraction: the closing 19 lines of the file become 23]
\ No newline at end of file
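Even with the markup lost, the surviving fragments pin down the template's data contract: a `tests` list rendered as "Equations ({{ tests|length }})", with each item exposing a `text` field inserted via `{{ test.text|safe }}`. A hypothetical, minimal sketch of that contract (not the real view code; the real template carries far more markup):

```python
from jinja2 import Template

# Distilled from the surviving fragments above.
fragment = Template(
    "Equations ({{ tests|length }})\n"
    "{% for test in tests %}{{ test.text|safe }}\n{% endfor %}"
)
print(fragment.render(tests=[{"text": r"\(a^2 + b^2 = c^2\)"}]))
```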
diff --git a/olmocr/loadertest.py b/olmocr/loadertest.py
new file mode 100644
index 0000000..61f9f2d
--- /dev/null
+++ b/olmocr/loadertest.py
@@ -0,0 +1,86 @@
+import json
+from concurrent.futures import ProcessPoolExecutor, as_completed
+
+import boto3
+from tqdm import tqdm
+
+# Configuration
+BUCKET = "ai2-llm"
+PREFIX = "pretraining-data/sources/soldni-open-access-books/v0/pipeline/results"
+OUTPUT_FILENAME = "all_completed_files.txt"
+
+
+def process_file(key: str):
+    """
+    Process a single S3 file given by its key.
+    Reads a jsonl file from S3, decodes each line,
+    extracts the 'Source-File' from the 'metadata' field,
+    and returns a list of these source file strings.
+    """
+    # Create a new S3 client in each worker process (clients are not shared across processes)
+    s3 = boto3.client("s3")
+    extracted_lines = []
+    try:
+        response = s3.get_object(Bucket=BUCKET, Key=key)
+        for raw_line in response["Body"].iter_lines():
+            try:
+                # Decode the line from bytes to text
+                line_str = raw_line.decode("utf-8")
+            except UnicodeDecodeError as e:
+                print(f"Skipping a line in {key} due to decode error: {e}")
+                continue
+            try:
+                data = json.loads(line_str)
+            except json.JSONDecodeError as e:
+                print(f"Skipping a malformed json line in {key}: {e}")
+                continue
+            # Extract 'Source-File' from metadata if present
+            metadata = data.get("metadata", {})
+            source_file = metadata.get("Source-File")
+            if source_file:
+                extracted_lines.append(source_file)
+    except Exception as e:
+        print(f"Error processing file {key}: {e}")
+    return extracted_lines
+
+
+def main():
+    s3 = boto3.client("s3")
+    paginator = s3.get_paginator("list_objects_v2")
+    page_iterator = paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)
+
+    # Gather all S3 object keys under the specified prefix
+    keys = []
+    for page in page_iterator:
+        if "Contents" not in page:
+            continue
+        for obj in page["Contents"]:
+            keys.append(obj["Key"])
+
+    print(f"Found {len(keys)} files to process.")
+
+    # Open the output file for writing
+    with open(OUTPUT_FILENAME, "w", encoding="utf-8") as output_file:
+        # Create a pool of worker processes to handle files concurrently.
+        # Adjust max_workers based on your environment and workload.
+        with ProcessPoolExecutor() as executor:
+            # Submit all processing jobs and map each future to its key
+            future_to_key = {executor.submit(process_file, key): key for key in keys}
+            # Use tqdm to wrap the as_completed iterator for progress display
+            for future in tqdm(as_completed(future_to_key), total=len(future_to_key), desc="Processing files"):
+                try:
+                    source_files = future.result()
+                    # Write each extracted line to the output file as soon as the future completes
+                    for source in source_files:
+                        output_file.write(source + "\n")
+                    # Optionally flush after each completed task
+                    output_file.flush()
+                except Exception as e:
+                    key = future_to_key[future]
+                    print(f"Exception occurred for file {key}: {e}")
+
+    print(f"Finished writing the source file names to {OUTPUT_FILENAME}")
+
+
+if __name__ == "__main__":
+    main()
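Since `process_file` only performs S3 GETs and JSON parsing, the workload is I/O-bound, and a thread pool is a plausible alternative to `ProcessPoolExecutor` that skips process startup and result pickling. A sketch under that assumption, reusing `process_file` from the module above; `max_workers=16` is an arbitrary tuning knob, not a value from this PR:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

from olmocr.loadertest import process_file

def collect_source_files(keys: list[str], max_workers: int = 16) -> list[str]:
    # Threads share the GIL but spend most of their time waiting on S3,
    # so concurrency still helps; results are gathered as futures complete.
    results: list[str] = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_key = {executor.submit(process_file, key): key for key in keys}
        for future in as_completed(future_to_key):
            results.extend(future.result())
    return results
```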