From 16a32445a254cd7de433ebdfa5ef597403853f1e Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Wed, 19 Feb 2025 16:10:46 -0800 Subject: [PATCH] olmocr running --- olmocr/bench/runners/run_gotocr.py | 2 +- olmocr/bench/runners/run_olmocr.py | 15 + .../got_ocr/discoverworld_crazy_table4.md | 543 ++++++++++++++++++ .../marker/discoverworld_crazy_table4.md | 63 ++ .../sample_data/marker/multi_column_miss.md | 12 +- .../mineru/discoverworld_crazy_table4.md | 13 + 6 files changed, 641 insertions(+), 7 deletions(-) create mode 100644 olmocr/bench/runners/run_olmocr.py create mode 100644 olmocr/bench/sample_data/got_ocr/discoverworld_crazy_table4.md create mode 100644 olmocr/bench/sample_data/marker/discoverworld_crazy_table4.md create mode 100644 olmocr/bench/sample_data/mineru/discoverworld_crazy_table4.md diff --git a/olmocr/bench/runners/run_gotocr.py b/olmocr/bench/runners/run_gotocr.py index 22efcbd..fd97ba1 100644 --- a/olmocr/bench/runners/run_gotocr.py +++ b/olmocr/bench/runners/run_gotocr.py @@ -72,7 +72,7 @@ def run(pdf_folder): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Convert all PDF files in a folder to markdown using GOT-OCR and save them to a sibling 'marker' folder." + description="Convert all PDF files in a folder to markdown using GOT-OCR and save them to a sibling 'got_ocr' folder." ) parser.add_argument( "pdf_folder", diff --git a/olmocr/bench/runners/run_olmocr.py b/olmocr/bench/runners/run_olmocr.py new file mode 100644 index 0000000..06d4c86 --- /dev/null +++ b/olmocr/bench/runners/run_olmocr.py @@ -0,0 +1,15 @@ +import sys +import glob +import asyncio +import olmocr.pipeline + +# Set sys.argv as if you were running the script from the command line. +sys.argv = [ + "pipeline.py", # The script name (can be arbitrary) + "olmocr/bench/sample_data/olmocr/workspace", # Positional argument: workspace + "--pdfs", *list(glob.glob("olmocr/bench/sample_data/pdfs/*.pdf")), # PDF paths +] + +# Call the async main() function. +asyncio.run(olmocr.pipeline.main()) + diff --git a/olmocr/bench/sample_data/got_ocr/discoverworld_crazy_table4.md b/olmocr/bench/sample_data/got_ocr/discoverworld_crazy_table4.md new file mode 100644 index 0000000..a91809d --- /dev/null +++ b/olmocr/bench/sample_data/got_ocr/discoverworld_crazy_table4.md @@ -0,0 +1,543 @@ +Table 4: Baseline model performance on each of the three scoring metrics (task completion, task process, +explanatory knowledge discovery) across all 24 DISCOVERYWORLD tasks. Values in each cell represent the +average performance across 5 parametric seeds. Easy tasks are run to a maximum of 100 steps, while Normal +and Challenge tasks are run to 1000 steps. +ReACT +Plan+Execute +Hypothesizer +# +Topic +Task +# +Topic +Task +# +Topic +Task +# +Topic +Task +# + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Protometrics +Clustering +1 +Easy +Simplified Clustering +0.37 +0.20 +0.20 +0.20 +0.20 +0.00 +0.00 +0.00 +0.40 +0.00 +2 +Easy +Simplified Clustering +0.40 +0.40 +0.40 +0.40 +0.64 +0.20 +0.00 +0.93 +0.40 +0.40 +3 +Challenge +Clustering (3D) +0.28 +0.40 +0.60 +0.58 +0.20 +0.00 +0.93 +0.44 +0.60 +0.60 +4 +Semi- +Chemistry +Exploring Combinations and Hill Climbing +0.00 +0.00 +0.00 +0.00 +0.01 +0.00 +0.00 +0.00 +0.05 +0.00 +5 +Normal +Mix of 3 substances +0.82 +0.00 +0.00 +0.87 +0.40 +0.00 +0.93 +0.40 +0.00 +0.6 +6 +Challenge +Mix of 4 substances +0.90 +0.40 +0.00 +0.90 +0.40 +0.00 +0.87 +0.00 +0.00 +0.00 +7 +Semi- +Easy +Simple instrument +0.27 +0.60 +0.00 +0.33 +0.20 +0.00 +0.60 +0.20 +0.50 +0.00 +8 +Challenge +Instrument Use +0.40 +0.40 +0.40 +0.48 +0.00 +0.00 +0.55 +0.40 +0.40 +0.40 +9 +Challenge +Simplified Clustering +0.40 +0.40 +1.00 +0.40 +0.00 +0.00 +0.55 +0.41 +0.00 +0.00 +9 +Reactor Lab +Regression +0.42 +0.00 +0.00 +0.40 +0.01 +0.10 +0.38 +0.00 +0.20 +11 +Normal +Linear regression +0.44 +0.00 +0.20 +0.49 +0.00 +0.00 +0.51 +0.00 +0.00 +12 +Challenge +Quadratic regression +0.43 +0.00 +0.20 +0.39 +0.00 +0.00 +0.39 +0.00 +0.00 +0.00 +13 +Semi- +Simplified rules +0.82 +0.00 +0.20 +0.70 +0.20 +0.20 +0.60 +0.00 +0.00 +14 +Normal +Presence rules +0.91 +0.60 +0.00 +0.84 +0.40 +0.00 +0.56 +0.00 +0.00 +15 +Semi- +Simplified Data +0.89 +0.40 +0.00 +0.73 +0.40 +0.00 +0.62 +0.00 +0.00 +Space Sick +Open-ended discovery +0.6 +0.00 +0.00 +0.68 +0.40 +0.10 +0.16 +0.00 +0.00 +16 +Easy +Single instrument +0.78 +0.00 +0.00 +0.68 +0.44 +0.10 +0.16 +0.00 +0.01 +17 +Semi- +Simplified +0.78 +0.00 +0.00 +0.69 +0.40 +0.10 +0.16 +0.01 +0.00 +18 +Challenge +Novel instruments +0.55 +0.00 +0.00 +0.26 +0.00 +0.00 +0.20 +0.00 +0.00 +Rocket Science +Multi-step measurements and applying formulas +0.3 +0.00 +0.00 +0.00 +0.02 +0.00 +0.00 +0.00 +0.03 +20 +Normal +Semi- +Simplified +0.51 +0.00 +0.05 +0.34 +0.00 +0.00 +0.11 +0.00 +0.00 +21 +Challenge +Measure 2 variables +0.51 +0.00 +0.05 +0.30 +0.00 +0.00 +0.11 +0.0.0 +0.00 +22 +Challenge +Measure 5 variables +0.43 +0.00 +0.00 +0.15 +0.00 +0.00 +0.22 +0.00 +0.03 +23 +Semi- +Simplified +0.50 +0.00 +0.00 +0.00 +0.20 +0.0 +0.00 +0.00 +0.00 +24 +Easy +Single noun +0.40 +0.40 +0.20 +0.30 +0.00 +0.00 +0.20 +0.20 +0.00 +25 +Normal +Noun and verb +0.20 +0.00 +0.00 +0.45 +0.20 +0.00 +0.15 +0.40 +0.00 +26 +Challenge +Noun, all., and verb +0.49 +0.00 +0.00 +0.49 +0.00 +0.00 +0.20 +0.01 +0.00 +27 +Semi- +Simplified +0.56 +0.18 +0.11 +0.56 +0.18 +0.11 +0.58 +0.25 +0.34 +Average (Normal) +0.63 +0.18 +0.14 +0.64 +0.18 +0.02 +0.58 +0.23 +0.19 +Average (Challenge) +0.63 +0.18 +0.10 +0.50 +0.15 +0.01 +0.49 +0.08 +0.08 +Table 5: Baseline model performance on each of the three scoring metrics (task completion, task process, ex- +planatory knowledge discovery) across all 10 unit test tasks. Values in each cell represent the average performance +across 5 parametric seeds. Unit tests tasks are run to a maximum of 100 steps. +ReACT +Plan+Execute +Hypothesizer +# +Unit Test Topic +0.25 +0.00 +0.00 +0.00 +0.04 +0.00 +0.00 +0.00 +0.06 +25 +Multi-turn dialog with an agent +1.00 +1.00 +1.00 +1.00 +1.0.0 +1.00 +1.00 +26 +Measure an object with an instrument +0.87 +0.60 +0.73 +0.40 +1.00 +1.00 +1.00 +28 +Semi- +Simplified +0.00 +0.00 +0.00 +0.08 +0.00 +0.00 +0.00 +0.0.0 +0.00 +29 +Pick-and-give object +1.00 +1.00 +1.00 +1.09 +1.00 +1.00 +1.00 +29 +Read DiscoveryFeed posts +0.50 +0.20 +0.25 +0.00 +0.30 +0.00 +0.00 +0.00 +0.50 +31 +Using keys with doors +0.69 +0.20 +0.54 +0.00 +0.69 +0.00 +32 +Navigate to a specific room in a house +0.20 +0.20 +0.20 +0.0 +0.00 +0.20 +0.20 +0.33 +33 +Search an environment for an object +0.60 +0.80 +0.60 +0.00 +0.53 +0.20 +34 +Interact with a moving agent +0.60 +0.20 +0.55 +0.00 +0.53 +0.20 +Average (Unit Tests) +0.78 +0.60 +0.00 +0.44 +0.73 +0.64 +4.2 +Baseline Agent Models +The baseline agents are described below, with model performance on Discovery tasks shown in +Table 4, and performance on Unit Tests shown in Table 5. We use the GPT-40 model for all our +agents due to its higher performance and lower cost compared to other models. For space we provide +7 \ No newline at end of file diff --git a/olmocr/bench/sample_data/marker/discoverworld_crazy_table4.md b/olmocr/bench/sample_data/marker/discoverworld_crazy_table4.md new file mode 100644 index 0000000..f321a64 --- /dev/null +++ b/olmocr/bench/sample_data/marker/discoverworld_crazy_table4.md @@ -0,0 +1,63 @@ +Table 4: Baseline model performance on each of the three scoring metrics *(task completion, task process, explanatory knowledge discovery)* across all 24 DISCOVERYWORLD tasks. Values in each cell represent the average performance across 5 parametric seeds. *Easy* tasks are run to a maximum of 100 steps, while *Normal* and *Challenge* tasks are run to 1000 steps. + +| | | | ReACT | | | | | Plan+Execute | | | Hypothesizer | | | +|-----------------------------|-----------------|------------------------------------------------------------|-----------------------------------------------|------|------|-------------------------|------|-------------------------|-----------|------|--------------|-----------|--| +| | | | Completion
Knowledge
Procedure | | | Completion
Procedure | | Completion
Procedure | | | | | | +| # | Topic | Task | | | | | | | Knowledge | | | Knowledge | | +| | Proteomics | Clustering | | | | | | | | | | | | +| 1 | Easy | Simplified Clustering | 0.87 | 0.20 | 0.20 | | 0.80 | 0.00 | 0.00 | 0.90 | 0.40 | 1.00 | | +| 2 | Normal | Clustering (2D) | 0.88 | 0.40 | 0.40 | | 0.68 | 0.20 | 0.00 | 0.93 | 0.40 | 0.40 | | +| 3 | Challenge | Clustering (3D) | 0.88 | 0.40 | 0.60 | | 0.58 | 0.20 | 0.00 | 0.93 | 0.40 | 0.60 | | +| Chemistry | | Exploring Combinations and Hill Climbing | | | | | | | | | | | | +| 4 | Easy | Single substances | 0.87 | 1.00 | 1.00 | | 0.70 | 0.60 | 0.40 | 0.90 | 0.00 | 0.40 | | +| 5 | Normal | Mix of 3 substances | 0.82 | 0.00 | 0.00 | | 0.87 | 0.40 | 0.00 | 0.93 | 0.60 | 0.40 | | +| 6 | Challenge | Mix of 4 substances | 0.90 | 0.40 | 0.00 | | 0.90 | 0.40 | 0.00 | 0.87 | 0.00 | 0.00 | | +| Archaeology
Correlations | | | | | | | | | | | | | | +| 7 | Easy | Simple instrument | 0.27 | 0.60 | 0.00 | | 0.33 | 0.20 | 0.00 | 0.60 | 0.20 | 0.50 | | +| 8 | Normal | Instrument Use | 0.72 | 0.40 | 0.30 | | 0.74 | 0.00 | 0.00 | 0.64 | 0.40 | 0.40 | | +| 9 | Challenge | Correlation | 0.46 | 0.20 | 0.00 | | 0.46 | 0.00 | 0.05 | 0.55 | 0.20 | 0.05 | | +| | Reactor Lab | Regression | | | | | | | | | | | | +| 10 | Easy | Slope only | 0.42 | 0.00 | 0.40 | | 0.44 | 0.00 | 0.10 | 0.38 | 0.00 | 0.20 | | +| 11 | Normal | Linear regression | 0.44 | 0.00 | 0.20 | | 0.49 | 0.00 | 0.00 | 0.51 | 0.00 | 0.00 | | +| 12 | Challenge | Quadratic regression | 0.43 | 0.00 | 0.20 | | 0.39 | 0.00 | 0.00 | 0.39 | 0.00 | 0.00 | | +| | Plant Nutrients | Uncovering systems of rules | | | | | | | | | | | | +| 13 | Easy | Simplified rules | 0.80 | 0.20 | 0.20 | | 0.70 | 0.20 | 0.20 | 0.60 | 0.00 | 0.00 | | +| 14 | Normal | Presence rules | 0.91 | 0.60 | 0.00 | | 0.84 | 0.40 | 0.00 | 0.56 | 0.00 | 0.00 | | +| 15 | Challenge | Logical Rules | 0.89 | 0.40 | 0.00 | | 0.73 | 0.40 | 0.00 | 0.62 | 0.00 | 0.00 | | +| | Space Sick | Open-ended discovery | | | | | | | | | | | | +| 16 | Easy | Single instrument | 0.78 | 0.60 | 0.00 | | 0.68 | 0.40 | 0.10 | 0.80 | 1.00 | 0.60 | | +| 17 | Normal | Multiple instruments | 0.58 | 0.00 | 0.13 | | 0.45 | 0.00 | 0.13 | 0.16 | 0.00 | 0.33 | | +| 18 | Challenge | Novel instruments | 0.55 | 0.00 | 0.00 | | 0.26 | 0.00 | 0.00 | 0.20 | 0.00 | 0.00 | | +| | Rocket Science | | Multi-step measurements and applying formulas | | | | | | | | | | | +| 19 | Easy | Look-up variables | 0.33 | 0.00 | 0.00 | | 0.53 | 0.00 | 0.07 | 0.13 | 0.40 | 0.00 | | +| 20 | Normal | Measure 2 variables | 0.51 | 0.00 | 0.05 | | 0.34 | 0.00 | 0.00 | 0.11 | 0.00 | 0.00 | | +| 21 | Challenge | Measure 5 variables | 0.43 | 0.00 | 0.00 | | 0.15 | 0.00 | 0.00 | 0.22 | 0.00 | 0.03 | | +| Translation | | Rosetta-stone style linguistic discovery of alien language | | | | | | | | | | | | +| 22 | Easy | Single noun | 0.40 | 0.40 | 0.20 | | 0.30 | 0.00 | 0.00 | 0.20 | 0.20 | 0.00 | | +| 23 | Normal | Noun and verb | 0.20 | 0.00 | 0.00 | | 0.68 | 0.40 | 0.00 | 0.84 | 0.40 | 0.00 | | +| 24 | Challenge | Noun, adj., and verb | 0.49 | 0.00 | 0.00 | | 0.55 | 0.20 | 0.05 | 0.15 | 0.00 | 0.00 | | +| | Average (Easy) | | 0.59 | 0.38 | 0.25 | | 0.56 | 0.18 | 0.11 | 0.56 | 0.28 | 0.34 | | +| Average (Normal) | | | 0.63 | 0.18 | 0.14 | | 0.64 | 0.18 | 0.02 | 0.58 | 0.23 | 0.19 | | +| Average (Challenge) | | | 0.63 | 0.18 | 0.10 | | 0.50 | 0.15 | 0.01 | 0.49 | 0.08 | 0.08 | | +| | | | | | | | | | | | | | | + +Table 5: Baseline model performance on each of the three scoring metrics *(task completion, task process, explanatory knowledge discovery)* across all 10 unit test tasks.Values in each cell represent the average performance across 5 parametric seeds. Unit tests tasks are run to a maximum of 100 steps. + +| | | | ReACT | | Plan+Execute | Hypothesizer | | | +|----|----------------------------------------|-----------|------------|-----------|--------------|--------------|------------|--| +| # | Unit Test Topic | Procedure | Completion | Procedure | Completion | Procedure | Completion | | +| 25 | Multi-turn dialog with an agent | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | | +| 26 | Measure an object with an instrument | 0.87 | 0.60 | 0.73 | 0.40 | 1.00 | 1.00 | | +| 27 | Pick-and-place object | 0.90 | 0.80 | 0.80 | 0.60 | 1.00 | 1.00 | | +| 28 | Pick-and-give object | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | | +| 29 | Read DiscoveryFeed posts | 1.00 | 1.00 | 0.90 | 0.80 | 1.00 | 1.00 | | +| 30 | Move through doors | 0.58 | 0.20 | 0.25 | 0.00 | 0.30 | 0.00 | | +| 31 | Using keys with doors | 0.69 | 0.20 | 0.54 | 0.00 | 0.69 | 0.00 | | +| 32 | Navigate to a specific room in a house | 0.20 | 0.20 | 0.20 | 0.00 | 0.20 | 0.20 | | +| 33 | Search an environment for an object | 0.80 | 0.80 | 0.60 | 0.60 | 1.00 | 1.00 | | +| 34 | Interact with a moving agent | 0.60 | 0.20 | 0.53 | 0.00 | 0.53 | 0.20 | | +| | Average (Unit Tests) | | 0.60 | 0.66 | 0.44 | 0.77 | 0.64 | | + +## 4.2 Baseline Agent Models + +The baseline agents are described below, with model performance on Discovery tasks shown in Table 4, and performance on Unit Tests shown in Table 5. We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide \ No newline at end of file diff --git a/olmocr/bench/sample_data/marker/multi_column_miss.md b/olmocr/bench/sample_data/marker/multi_column_miss.md index cd1e877..a52d7ab 100644 --- a/olmocr/bench/sample_data/marker/multi_column_miss.md +++ b/olmocr/bench/sample_data/marker/multi_column_miss.md @@ -1,3 +1,5 @@ +Advocacy in Action 447 + stakeholders has occurred in other nations, with groups and individuals refusing to risk being appropriated into the industry's public relations ambitions. It now looks like that with vigilance, tobacco control advocates can easily foment similar distaste in many areas of the business community. Our actions sought to denormalise the tobacco industry by disrupting its efforts to take its place alongside other industries—often with considerable social credit—in the hope that it might gain by association. Tobacco industry posturing about its corporate responsibility can never hide the ugly consequences of its ongoing efforts to ''work with all relevant stakeholders for the preservation of opportunities for informed adults to consume tobacco products''1 (translation: ''we will build alliances with others who want to profit from tobacco use, to do all we can to counteract effective tobacco control''). BAT has 15.4% and Philip Morris 16.4% of the global cigarette market.6 With 4.9 million smokers currently dying from tobacco use each year, and the industry unblinkingly concurring that its products are addictive, this leaves BAT to argue why it should not be held to be largely accountable for the annual deaths of some 754 600 smokers, and Philip Morris some 803 600 smokers. @@ -18,7 +20,9 @@ Corporate social responsibility and the tobacco industry: hope or hype? ## N Hirschhorn -............................................................................................................................... Tobacco Control 2004;13:447–453. doi: 10.1136/tc.2003.006676 +............................................................................................................................... + +Tobacco Control 2004;13:447–453. doi: 10.1136/tc.2003.006676 Corporate social responsibility (CSR) emerged from a realisation among transnational corporations of the need to account for and redress their adverse impact on society: specifically, on human rights, labour practices, and the environment. Two transnational tobacco companies have recently adopted CSR: Philip Morris, and British American Tobacco. This report explains the origins and theory behind CSR; examines internal company documents from Philip Morris showing the company's deliberations on the matter, and the company's perspective on its own behaviour; and reflects on whether marketing tobacco is antithetical to social responsibility. @@ -28,11 +32,7 @@ Correspondence to: Dr Norbert Hirschhorn, Nastolantie 6, A3 00600 Helsinki, Finl ....................... -Received 13 November 2003 Accepted 15 July 2004 ....................... Over the past three decades increasing pressure from non-governmental organisations (NGOs), governments and the United Nations, has required transnational corporations (TNCs) to examine and redress the adverse impact their businesses have on society and the environment. Many have responded by taking up what is known as ''corporate social responsibility'' (CSR); only recently have two major cigarette companies followed suit: Philip Morris (PM) and British American Tobacco (BAT). This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version. This paper examines whether a - -tobacco company espousing CSR should be judged simply as a corporate entity along - -standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron. +Received 13 November 2003 Accepted 15 July 2004 ....................... Over the past three decades increasing pressure from non-governmental organisations (NGOs), governments and the United Nations, has required transnational corporations (TNCs) to examine and redress the adverse impact their businesses have on society and the environment. Many have responded by taking up what is known as ''corporate social responsibility'' (CSR); only recently have two major cigarette companies followed suit: Philip Morris (PM) and British American Tobacco (BAT). This report first provides the context and development of CSR; then, from internal company documents, examines how PM came to its own version. This paper examines whether a tobacco company espousing CSR should be judged simply as a corporate entity along standards of business ethics, or as an irretrievably negative force in the realm of public health, thereby rendering CSR an oxymoron. ## CORPORATE SOCIAL RESPONSIBILITY: THE CONTEXT diff --git a/olmocr/bench/sample_data/mineru/discoverworld_crazy_table4.md b/olmocr/bench/sample_data/mineru/discoverworld_crazy_table4.md new file mode 100644 index 0000000..9cb88dc --- /dev/null +++ b/olmocr/bench/sample_data/mineru/discoverworld_crazy_table4.md @@ -0,0 +1,13 @@ +Table 4: Baseline model performance on each of the three scoring metrics (task completion, task process, explanatory knowledge discovery) across all 24 DISCOVERYWORLD tasks. Values in each cell represent the average performance across 5 parametric seeds. Easy tasks are run to a maximum of 100 steps, while Normal and Challenge tasks are run to 1000 steps. + + +
ReACTPlan+ExecuteHypothesizer
edure
# TopicTask
Proteomics 1 EasyClusteringSimplified Clustering0.870.200.200.800.000.000.900.401.00
2NormalClustering (2D)0.880.400.400.680.200.000.930.400.40
3ChallengeClustering (3D)0.880.400.600.580.200.000.930.400.60
Exploring Combinations and Hill Climbing
Chemistry 4Single substances0.871.001.000.700.40
Easy0.600.900.000.40
5 NormalMix of 3 substances0.820.000.000.870.400.000.930.600.40
6 ChallengeMix of 4 substances0.900.400.000.900.400.000.870.000.00
ArchaeologyCorrelations
7 EasySimple instrument0.270.600.000.330.200.000.600.200.50
8 9NormalInstrument Use0.72 0.460.40 0.200.30 0.000.740.000.000.640.400.40
ChallengeCorrelation0.460.000.050.550.200.05
Reactor Lab EasyRegressionSlope only
10 110.42 0.440.000.400.440.000.100.380.000.20
Normal 12Linear regression0.430.00 0.000.20 0.200.490.000.000.510.000.00
Challenge Plant NutrientsQuadraticregression0.390.000.000.390.000.00
13Uncovering systems of rules Simplified rules0.800.200.200.200.200.60
Easy 14NormalPresence rules0.910.600.000.700.400.560.000.00
15Logical Rules0.890.400.000.840.400.000.000.00
Challenge0.730.000.620.000.00
Space SickOpen-endeddiscovery Single instrument0.780.600.00
16 Easy Normal0.680.400.100.801.000.60
17 18Multiple instruments0.580.000.130.450.000.130.160.000.33
Challenge Novel instruments0.550.000.000.260.000.000.200.000.00
Rocket Science EasyMulti-step measurements and applying formulas
19Look-up variables0.330.000.000.530.000.070.130.400.00
20 NormalMeasure 2 variables0.510.00 0.050.340.000.000.110.000.00
21 ChallengeMeasure 5 variables0.430.000.000.150.000.000.220.000.03
Translation 22Rosetta-stone style linguistic discovery of alienlanguage0.40
Easy 23Single noun Noun and verb0.400.200.300.000.00 0.000.20 0.840.200.00
24Normal ChallengeNoun, adj., and verb0.20 0.490.00 0.000.00 0.000.68 0.550.40 0.200.050.150.40 0.000.00 0.00
Average (Easy)0.560.280.34
Average (Normal)0.59 0.630.38 0.180.25 0.140.56 0.640.18 0.180.11 0.02
+ +Table 5: Baseline model performance on each of the three scoring metrics (task completion, task process, explanatory knowledge discovery) across all 10 unit test tasks.Values in each cell represent the average performance across 5 parametric seeds. Unit tests tasks are run to a maximum of 100 steps. + + +
UnitTestTopicReACT ProcedurePlan+ExecuteHypothesizer
CompletionProcedureCompletionProcedure
25Multi-turndialogwithanagent1.001.001.001.001.001.00
26Measure an objectwith aninstrument0.870.600.730.401.001.00
27Pick-and-place object0.900.800.800.601.001.00
28Pick-and-give object1.001.001.001.001.001.00
29ReadDiscoveryFeedposts1.001.000.900.801.001.00
30Movethroughdoors0.580.200.250.000.300.00
31Usingkeys with doors0.690.200.540.000.690.00
32Navigate to a specificroomin a house0.200.200.200.000.200.20
33Searchanenvironmentforanobject0.800.800.600.601.001.00
34Interactwithamovingagent0.600.200.530.000.530.20
Average (Unit Tests)0.760.600.660.440.770.64
+ +# 4.2 Baseline Agent Models + +The baseline agents are described below, with model performance on Discovery tasks shown in Table 4, and performance on Unit Tests shown in Table 5. We use the GPT-4O model for all our agents due to its higher performance and lower cost compared to other models. For space we provide \ No newline at end of file