Mirror of https://github.com/allenai/olmocr.git, synced 2025-07-31 21:13:57 +00:00

olmocr bench runner

parent c20e3c0702
commit a348cd6e8f
@@ -1,9 +1,15 @@
 import sys
 import glob
+import json
+import os
+import shutil
 import asyncio
 import olmocr.pipeline

 # Set sys.argv as if you were running the script from the command line.
+
+workspace_dir = "olmocr/bench/sample_data/olmocr/workspace"
+
 sys.argv = [
     "pipeline.py",  # The script name (can be arbitrary)
     "olmocr/bench/sample_data/olmocr/workspace",  # Positional argument: workspace
@@ -11,5 +17,17 @@ sys.argv = [
 ]

 # Call the async main() function.
-asyncio.run(olmocr.pipeline.main())
+# asyncio.run(olmocr.pipeline.main())
+
+# Now, take the produced .jsonl result files and unpack them into .md files.
+for jsonl_path in glob.glob(workspace_dir + "/results/*.jsonl"):
+    with open(jsonl_path, "r") as jsonl_f:
+        for line in jsonl_f:
+            data = json.loads(line)
+
+            name = os.path.basename(data["metadata"]["Source-File"])
+
+            with open(f"olmocr/bench/sample_data/olmocr/{name.replace('.pdf', '.md')}", "w") as out_f:
+                out_f.write(data["text"])
+
+shutil.rmtree(workspace_dir)
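Each line in the workspace's results/*.jsonl files is a standalone JSON document; the unpacking loop above relies on only two fields, the extracted text and the source PDF path stored in the metadata. Below is a minimal sketch of that filename mapping, using a made-up record purely for illustration (the field names match what the loop reads, but the example path is hypothetical and real olmocr results carry additional metadata not shown here):

```python
import json
import os

# Hypothetical example of one results line; only the two fields the
# unpacking loop actually reads ("text" and metadata["Source-File"]) are shown.
example_line = json.dumps({
    "text": "# Page text extracted by the pipeline...",
    "metadata": {"Source-File": "/pdfs/discoveryworld_tables.pdf"},
})

data = json.loads(example_line)
name = os.path.basename(data["metadata"]["Source-File"])

# The runner writes the text to an .md file named after the source PDF.
print(name.replace(".pdf", ".md"))  # discoveryworld_tables.md
```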
@@ -0,0 +1,52 @@
Table 4: Baseline model performance on each of the three scoring metrics (task completion, task process, explanatory knowledge discovery) across all 24 DISCOVERY WORLD tasks. Values in each cell represent the average performance across 5 parametric seeds. Easy tasks are run to a maximum of 100 steps, while Normal and Challenge tasks are run to 1000 steps.

| # | Topic | Task | ReACT | Plan+Execute | Hypothesizer |
|---|-----------------|-----------------------|-----------|--------------|--------------|
| | | | Pressure | Completion | Knowledge | Pressure | Completion | Knowledge |
| 1 | Proteomics | Clustering | 0.87 | 0.20 | 0.20 | 0.80 | 0.00 | 0.00 | 0.90 | 0.40 | 1.00 |
| 2 | Chemistry | Exploring Combinations and Hill Climbing | 0.89 | 0.40 | 0.00 | 0.89 | 0.20 | 0.00 | 0.95 | 0.40 | 0.60 |
| 3 | Archaeology | Correlations | 0.90 | 0.40 | 0.00 | 0.90 | 0.40 | 0.00 | 0.95 | 0.60 | 0.40 |
| 4 | Reactor Lab | Regression | 0.91 | 0.60 | 0.00 | 0.94 | 0.40 | 0.00 | 0.87 | 0.00 | 0.00 |
| 5 | Plant Nutrients | Uncovering systems of rules | 0.91 | 0.40 | 0.00 | 0.91 | 0.40 | 0.00 | 0.87 | 0.00 | 0.00 |
| 6 | Space Sick | Open-ended discovery | 0.78 | 0.60 | 0.00 | 0.78 | 0.40 | 0.10 | 0.80 | 1.00 | 0.60 |
| 7 | Archaeology | Correlations | 0.89 | 0.40 | 0.00 | 0.90 | 0.40 | 0.00 | 0.55 | 0.20 | 0.03 |
| 8 | Plant Nutrients | Uncovering systems of rules | 0.91 | 0.60 | 0.00 | 0.91 | 0.40 | 0.00 | 0.55 | 0.20 | 0.03 |
| 9 | Space Sick | Open-ended discovery | 0.78 | 0.60 | 0.00 | 0.78 | 0.40 | 0.10 | 0.80 | 1.00 | 0.60 |
| 10| Easy | Simplified Clustering | 0.42 | 0.00 | 0.40 | 0.44 | 0.00 | 0.10 | 0.38 | 0.00 | 0.20 |
| 11| Normal | Linear regression | 0.44 | 0.00 | 0.20 | 0.49 | 0.00 | 0.00 | 0.51 | 0.00 | 0.00 |
| 12| Challenge | Quadratic regression | 0.43 | 0.00 | 0.20 | 0.39 | 0.00 | 0.00 | 0.39 | 0.00 | 0.00 |
| 13| Easy | Simplified Clustering | 0.80 | 0.20 | 0.20 | 0.70 | 0.20 | 0.20 | 0.60 | 0.00 | 0.00 |
| 14| Normal | Presence rules | 0.91 | 0.60 | 0.00 | 0.84 | 0.40 | 0.00 | 0.56 | 0.00 | 0.00 |
| 15| Challenge | Logical Rules | 0.89 | 0.40 | 0.00 | 0.73 | 0.40 | 0.00 | 0.62 | 0.00 | 0.00 |
| 16| Easy | Single instrument | 0.78 | 0.60 | 0.00 | 0.68 | 0.40 | 0.10 | 0.80 | 1.00 | 0.60 |
| 17| Normal | Multiple instruments | 0.58 | 0.00 | 0.13 | 0.45 | 0.00 | 0.13 | 0.16 | 0.00 | 0.33 |
| 18| Challenge | Novel instruments | 0.55 | 0.00 | 0.00 | 0.26 | 0.00 | 0.00 | 0.20 | 0.00 | 0.00 |
| 19| Easy | Look-up variables | 0.33 | 0.00 | 0.00 | 0.53 | 0.00 | 0.07 | 0.13 | 0.40 | 0.00 |
| 20| Normal | Measure 2 variables | 0.51 | 0.00 | 0.05 | 0.34 | 0.00 | 0.00 | 0.11 | 0.00 | 0.00 |
| 21| Challenge | Measure 5 variables | 0.43 | 0.00 | 0.00 | 0.15 | 0.00 | 0.00 | 0.22 | 0.00 | 0.03 |
| 22| Easy | Single noun | 0.40 | 0.40 | 0.20 | 0.30 | 0.00 | 0.00 | 0.20 | 0.20 | 0.00 |
| 23| Normal | Noun and verb | 0.20 | 0.00 | 0.00 | 0.68 | 0.40 | 0.00 | 0.54 | 0.40 | 0.00 |
| 24| Challenge | Noun, adj., and verb | 0.49 | 0.00 | 0.00 | 0.55 | 0.20 | 0.05 | 0.15 | 0.00 | 0.00 |
| | Average (Easy) | | 0.59 | 0.38 | 0.25 | 0.56 | 0.18 | 0.11 | 0.56 | 0.28 | 0.34 |
| | Average (Normal)| | 0.63 | 0.18 | 0.14 | 0.64 | 0.18 | 0.02 | 0.58 | 0.23 | 0.19 |
| | Average (Challenge) | | 0.63 | 0.18 | 0.10 | 0.50 | 0.15 | 0.01 | 0.49 | 0.08 | 0.08 |

Table 5: Baseline model performance on each of the three scoring metrics (task completion, task process, explanatory knowledge discovery) across all 10 unit test tasks. Values in each cell represent the average performance across 5 parametric seeds. Unit tests tasks are run to a maximum of 100 steps.

| # | Unit Test Topic | ReACT | Plan+Execute | Hypothesizer |
|---|-----------------|-------|--------------|--------------|
| | | Pressure | Completion | Knowledge | Pressure | Completion | Knowledge | Pressure | Completion | Knowledge |
| 25| Multi-turn dialog with an agent | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| 26| Measure an object with an instrument | 0.87 | 0.60 | 0.73 | 0.40 | 1.00 | 1.00 |
| 27| Pick-and-place object | 0.90 | 0.80 | 0.80 | 0.60 | 1.00 | 1.00 |
| 28| Read Discovery/Feed posts | 1.00 | 1.00 | 0.90 | 0.80 | 1.00 | 1.00 |
| 29| Move through doors | 0.58 | 0.20 | 0.25 | 0.00 | 0.30 | 0.00 |
| 30| Using keys with doors | 0.69 | 0.20 | 0.54 | 0.00 | 0.69 | 0.00 |
| 31| Navigate to a specific room in a house | 0.20 | 0.20 | 0.20 | 0.00 | 0.20 | 0.20 |
| 32| Search an environment for an object | 0.80 | 0.80 | 0.60 | 0.60 | 1.00 | 1.00 |
| 33| Interact with a moving agent | 0.60 | 0.20 | 0.53 | 0.00 | 0.53 | 0.20 |
| | Average (Unit Tests) | 0.76 | 0.60 | 0.66 | 0.44 | 0.77 | 0.64 |

4.2 Baseline Agent Models

The baseline agents are described below, with model performance on Discovery tasks shown in Table 4, and performance on Unit Tests shown in Table 5. We use the GPT-4G model for all our agents due to its higher performance and lower cost compared to other models. For space we provide