From a348cd6e8f272d7420459dec9ac4ea01b0fbcc41 Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Fri, 21 Feb 2025 09:57:07 -0800
Subject: [PATCH] olmocr bench runner

---
 olmocr/bench/runners/run_olmocr.py            | 20 ++++++-
 .../olmocr/discoverworld_crazy_table4.md      | 52 +++++++++++++++++++
 2 files changed, 71 insertions(+), 1 deletion(-)
 create mode 100644 olmocr/bench/sample_data/olmocr/discoverworld_crazy_table4.md

diff --git a/olmocr/bench/runners/run_olmocr.py b/olmocr/bench/runners/run_olmocr.py
index 06d4c86..b385f97 100644
--- a/olmocr/bench/runners/run_olmocr.py
+++ b/olmocr/bench/runners/run_olmocr.py
@@ -1,9 +1,15 @@
 import sys
 import glob
+import json
+import os
+import shutil
 import asyncio
 import olmocr.pipeline
 
 # Set sys.argv as if you were running the script from the command line.
+
+workspace_dir = "olmocr/bench/sample_data/olmocr/workspace"
+
 sys.argv = [
     "pipeline.py",              # The script name (can be arbitrary)
     "olmocr/bench/sample_data/olmocr/workspace",            # Positional argument: workspace
@@ -11,5 +17,17 @@ sys.argv = [
 ]
 
 # Call the async main() function.
-asyncio.run(olmocr.pipeline.main())
+# asyncio.run(olmocr.pipeline.main())
 
+# Unpack every record of the produced results/*.jsonl files into its own .md file next to the workspace.
+for jsonl_path in glob.glob(os.path.join(workspace_dir, "results", "*.jsonl")):
+    with open(jsonl_path, "r", encoding="utf-8") as jsonl_f:
+        for line in jsonl_f:
+            data = json.loads(line)
+            # Swap only the trailing extension; str.replace would also rewrite ".pdf" mid-name.
+            stem, _ = os.path.splitext(os.path.basename(data["metadata"]["Source-File"]))
+            out_path = os.path.join(os.path.dirname(workspace_dir), stem + ".md")
+            with open(out_path, "w", encoding="utf-8") as out_f:
+                out_f.write(data["text"])
+
+shutil.rmtree(workspace_dir)
\ No newline at end of file
diff --git a/olmocr/bench/sample_data/olmocr/discoverworld_crazy_table4.md b/olmocr/bench/sample_data/olmocr/discoverworld_crazy_table4.md
new file mode 100644
index 0000000..c23aff5
--- /dev/null
+++ b/olmocr/bench/sample_data/olmocr/discoverworld_crazy_table4.md
@@ -0,0 +1,52 @@
+Table 4: Baseline model performance on each of the three scoring metrics (task completion, task process, explanatory knowledge discovery) across all 24 DISCOVERY WORLD tasks. Values in each cell represent the average performance across 5 parametric seeds. Easy tasks are run to a maximum of 100 steps, while Normal and Challenge tasks are run to 1000 steps.
+
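+| # | Topic           | Task                  | ReACT     |              |              | Plan+Execute |              |           | Hypothesizer |            |           |
+|---|-----------------|-----------------------|-----------|--------------|--------------|-----------|--------------|-----------|-----------|------------|-----------|
+|   |                 |                       | Pressure  | Completion   | Knowledge    | Pressure  | Completion   | Knowledge | Pressure  | Completion | Knowledge |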
+| 1 | Proteomics      | Clustering            | 0.87      | 0.20         | 0.20         | 0.80      | 0.00         | 0.00      | 0.90      | 0.40       | 1.00      |
+| 2 | Chemistry       | Exploring Combinations and Hill Climbing | 0.89      | 0.40         | 0.00         | 0.89      | 0.20         | 0.00      | 0.95      | 0.40       | 0.60      |
+| 3 | Archaeology     | Correlations          | 0.90      | 0.40         | 0.00         | 0.90      | 0.40         | 0.00      | 0.95      | 0.60       | 0.40      |
+| 4 | Reactor Lab     | Regression            | 0.91      | 0.60         | 0.00         | 0.94      | 0.40         | 0.00      | 0.87      | 0.00       | 0.00      |
+| 5 | Plant Nutrients | Uncovering systems of rules | 0.91      | 0.40         | 0.00         | 0.91      | 0.40         | 0.00      | 0.87      | 0.00       | 0.00      |
+| 6 | Space Sick      | Open-ended discovery   | 0.78      | 0.60         | 0.00         | 0.78      | 0.40         | 0.10      | 0.80      | 1.00       | 0.60      |
+| 7 | Archaeology     | Correlations          | 0.89      | 0.40         | 0.00         | 0.90      | 0.40         | 0.00      | 0.55      | 0.20       | 0.03      |
+| 8 | Plant Nutrients | Uncovering systems of rules | 0.91      | 0.60         | 0.00         | 0.91      | 0.40         | 0.00      | 0.55      | 0.20       | 0.03      |
+| 9 | Space Sick      | Open-ended discovery   | 0.78      | 0.60         | 0.00         | 0.78      | 0.40         | 0.10      | 0.80      | 1.00       | 0.60      |
+| 10| Easy            | Simplified Clustering  | 0.42      | 0.00         | 0.40         | 0.44      | 0.00         | 0.10      | 0.38      | 0.00       | 0.20      |
+| 11| Normal          | Linear regression      | 0.44      | 0.00         | 0.20         | 0.49      | 0.00         | 0.00      | 0.51      | 0.00       | 0.00      |
+| 12| Challenge       | Quadratic regression   | 0.43      | 0.00         | 0.20         | 0.39      | 0.00         | 0.00      | 0.39      | 0.00       | 0.00      |
+| 13| Easy            | Simplified Clustering  | 0.80      | 0.20         | 0.20         | 0.70      | 0.20         | 0.20      | 0.60      | 0.00       | 0.00      |
+| 14| Normal          | Presence rules         | 0.91      | 0.60         | 0.00         | 0.84      | 0.40         | 0.00      | 0.56      | 0.00       | 0.00      |
+| 15| Challenge       | Logical Rules          | 0.89      | 0.40         | 0.00         | 0.73      | 0.40         | 0.00      | 0.62      | 0.00       | 0.00      |
+| 16| Easy            | Single instrument      | 0.78      | 0.60         | 0.00         | 0.68      | 0.40         | 0.10      | 0.80      | 1.00       | 0.60      |
+| 17| Normal          | Multiple instruments   | 0.58      | 0.00         | 0.13         | 0.45      | 0.00         | 0.13      | 0.16      | 0.00       | 0.33      |
+| 18| Challenge       | Novel instruments      | 0.55      | 0.00         | 0.00         | 0.26      | 0.00         | 0.00      | 0.20      | 0.00       | 0.00      |
+| 19| Easy            | Look-up variables      | 0.33      | 0.00         | 0.00         | 0.53      | 0.00         | 0.07      | 0.13      | 0.40       | 0.00      |
+| 20| Normal          | Measure 2 variables    | 0.51      | 0.00         | 0.05         | 0.34      | 0.00         | 0.00      | 0.11      | 0.00       | 0.00      |
+| 21| Challenge       | Measure 5 variables    | 0.43      | 0.00         | 0.00         | 0.15      | 0.00         | 0.00      | 0.22      | 0.00       | 0.03      |
+| 22| Easy            | Single noun            | 0.40      | 0.40         | 0.20         | 0.30      | 0.00         | 0.00      | 0.20      | 0.20       | 0.00      |
+| 23| Normal          | Noun and verb          | 0.20      | 0.00         | 0.00         | 0.68      | 0.40         | 0.00      | 0.54      | 0.40       | 0.00      |
+| 24| Challenge       | Noun, adj., and verb   | 0.49      | 0.00         | 0.00         | 0.55      | 0.20         | 0.05      | 0.15      | 0.00       | 0.00      |
+|   | Average (Easy)  |                       | 0.59      | 0.38         | 0.25         | 0.56      | 0.18         | 0.11      | 0.56      | 0.28       | 0.34      |
+|   | Average (Normal)|                       | 0.63      | 0.18         | 0.14         | 0.64      | 0.18         | 0.02      | 0.58      | 0.23       | 0.19      |
+|   | Average (Challenge) |                   | 0.63      | 0.18         | 0.10         | 0.50      | 0.15         | 0.01      | 0.49      | 0.08       | 0.08      |
+
+Table 5: Baseline model performance on each of the three scoring metrics (task completion, task process, explanatory knowledge discovery) across all 10 unit test tasks. Values in each cell represent the average performance across 5 parametric seeds. Unit tests tasks are run to a maximum of 100 steps.
+
+| # | Unit Test Topic | ReACT | Plan+Execute | Hypothesizer |
+|---|-----------------|-------|--------------|--------------|
+|   |                 | Pressure | Completion | Knowledge | Pressure | Completion | Knowledge | Pressure | Completion | Knowledge |
+| 25| Multi-turn dialog with an agent | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
+| 26| Measure an object with an instrument | 0.87 | 0.60 | 0.73 | 0.40 | 1.00 | 1.00 |
+| 27| Pick-and-place object | 0.90 | 0.80 | 0.80 | 0.60 | 1.00 | 1.00 |
+| 28| Read Discovery/Feed posts | 1.00 | 1.00 | 0.90 | 0.80 | 1.00 | 1.00 |
+| 29| Move through doors | 0.58 | 0.20 | 0.25 | 0.00 | 0.30 | 0.00 |
+| 30| Using keys with doors | 0.69 | 0.20 | 0.54 | 0.00 | 0.69 | 0.00 |
+| 31| Navigate to a specific room in a house | 0.20 | 0.20 | 0.20 | 0.00 | 0.20 | 0.20 |
+| 32| Search an environment for an object | 0.80 | 0.80 | 0.60 | 0.60 | 1.00 | 1.00 |
+| 33| Interact with a moving agent | 0.60 | 0.20 | 0.53 | 0.00 | 0.53 | 0.20 |
+|   | Average (Unit Tests) | 0.76 | 0.60 | 0.66 | 0.44 | 0.77 | 0.64 |
+
+4.2 Baseline Agent Models
+
+The baseline agents are described below, with model performance on Discovery tasks shown in Table 4, and performance on Unit Tests shown in Table 5. We use the GPT-4o model for all our agents due to its higher performance and lower cost compared to other models. For space we provide
\ No newline at end of file