diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/reasoning_model_context.py b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/reasoning_model_context.py
new file mode 100644
index 000000000..c61dade13
--- /dev/null
+++ b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/reasoning_model_context.py
@@ -0,0 +1,17 @@
+from typing import List
+from autogen_core.model_context import UnboundedChatCompletionContext
+from autogen_core.models import AssistantMessage, LLMMessage
+
+
+class ReasoningModelContext(UnboundedChatCompletionContext):
+    """A model context for reasoning models."""
+
+    async def get_messages(self) -> List[LLMMessage]:
+        messages = await super().get_messages()
+        # Filter out thought field from AssistantMessage.
+        messages_out = []
+        for message in messages:
+            if isinstance(message, AssistantMessage):
+                message.thought = None
+            messages_out.append(message)
+        return messages_out
\ No newline at end of file
diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/scenario.py b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/scenario.py
index 96d8cd968..097e02604 100644
--- a/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/scenario.py
+++ b/python/packages/agbench/benchmarks/HumanEval/Templates/AgentChat/scenario.py
@@ -5,9 +5,11 @@ from autogen_ext.agents.magentic_one import MagenticOneCoderAgent
 from autogen_agentchat.teams import RoundRobinGroupChat
 from autogen_agentchat.ui import Console
 from autogen_core.models import ModelFamily
+from autogen_core.model_context import UnboundedChatCompletionContext, ChatCompletionContext
 from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
 from autogen_agentchat.conditions import TextMentionTermination
 from custom_code_executor import CustomCodeExecutorAgent
+from reasoning_model_context import ReasoningModelContext
 from autogen_core.models import ChatCompletionClient
 
 async def main() -> None:
@@ -17,11 +19,20 @@ async def main() -> None:
         config = yaml.safe_load(f)
     model_client = ChatCompletionClient.load_component(config["model_config"])
 
+    # Model context
+    model_context : ChatCompletionContext
+    if model_client.model_info["family"] == ModelFamily.R1:
+        model_context = ReasoningModelContext()
+    else:
+        model_context = UnboundedChatCompletionContext()
+
     # Coder
     coder_agent = MagenticOneCoderAgent(
         name="coder",
         model_client=model_client,
     )
+    # Set model context.
+    coder_agent._model_context = model_context  # type: ignore
 
     # Executor
     executor = CustomCodeExecutorAgent(
diff --git a/python/packages/agbench/src/agbench/run_cmd.py b/python/packages/agbench/src/agbench/run_cmd.py
index 181088d44..a8a8161c5 100644
--- a/python/packages/agbench/src/agbench/run_cmd.py
+++ b/python/packages/agbench/src/agbench/run_cmd.py
@@ -426,13 +426,17 @@ fi
 # Run the scenario
 pip install -r requirements.txt
 echo SCENARIO.PY STARTING !#!#
+start_time=$(date +%s)
 timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
+end_time=$(date +%s)
 EXIT_CODE=$?
 if [ $EXIT_CODE -ne 0 ]; then
     echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
 else
     echo SCENARIO.PY COMPLETE !#!#
 fi
+elapsed_time=$((end_time - start_time))
+echo "SCENARIO.PY RUNTIME: $elapsed_time !#!#"
 
 # Clean up
 if [ -d .cache ] ; then
@@ -543,13 +547,17 @@ fi
 # Run the scenario
 pip install -r requirements.txt
 echo SCENARIO.PY STARTING !#!#
+start_time=$(date +%s)
 timeout --preserve-status --kill-after {timeout + 30}s {timeout}s python scenario.py
+end_time=$(date +%s)
 EXIT_CODE=$?
 if [ $EXIT_CODE -ne 0 ]; then
     echo SCENARIO.PY EXITED WITH CODE: $EXIT_CODE !#!#
 else
     echo SCENARIO.PY COMPLETE !#!#
 fi
+elapsed_time=$((end_time - start_time))
+echo "SCENARIO.PY RUNTIME: $elapsed_time !#!#"
 
 # Clean up
 if [ -d .cache ] ; then
@@ -613,6 +621,7 @@ echo RUN.SH COMPLETE !#!#
         auto_remove=True,
         # Type hint of docker is wrong here
         volumes=volumes,  # type: ignore
+        network="host",  # Use the host network to avoid issues with localhost.
     )
 
     # Read the logs in a streaming fashion. Keep an eye on the time to make sure we don't need to stop.
@@ -930,9 +939,6 @@ def run_cli(args: Sequence[str]) -> None:
         if IS_WIN32:
             sys.exit("Running scenarios with --native is not supported in Windows. Exiting.")
 
-        if parsed_args.requirements is not None:
-            sys.exit("--requirements is not compatible with --native. Exiting.")
-
         sys.stderr.write(
             "WARNING: Running natively, without Docker, not only poses the usual risks of executing arbitrary AI generated code on your machine, it also makes it impossible to ensure that each test starts from a known and consistent set of initial conditions. For example, if the agents spend time debugging and installing Python libraries to solve the task, then those libraries will be available to all other runs. In other words, earlier runs can influence later runs, leading to many confounds in testing.\n\n"
         )
diff --git a/python/packages/agbench/src/agbench/tabulate_cmd.py b/python/packages/agbench/src/agbench/tabulate_cmd.py
index 2b1eb37b9..b48a9876c 100644
--- a/python/packages/agbench/src/agbench/tabulate_cmd.py
+++ b/python/packages/agbench/src/agbench/tabulate_cmd.py
@@ -1,9 +1,10 @@
 import argparse
 import os
+import re
 import sys
-from copy import deepcopy
-from typing import Any, Callable, List, Optional, Sequence, Tuple
+from typing import Any, Callable, Dict, List, Optional, Sequence
 
+import pandas as pd
 import tabulate as tb
 
 from .load_module import load_module
@@ -25,6 +26,8 @@ COMPLETED_STRINGS = [
 
 EXCLUDE_DIR_NAMES = ["__pycache__"]
 
+TIMER_REGEX = r"RUNTIME:\s*([\d.]+) !#!#"
+
 
 def find_tabulate_module(search_dir: str, stop_dir: Optional[str] = None) -> Optional[str]:
     """Hunt for the tabulate script."""
@@ -84,12 +87,32 @@ def default_scorer(instance_dir: str, success_strings: List[str] = SUCCESS_STRIN
         return None
 
 
+def default_timer(instance_dir: str, timer_regex: str = TIMER_REGEX) -> Optional[float]:
+    console_log = os.path.join(instance_dir, "console_log.txt")
+    if os.path.isfile(console_log):
+        with open(console_log, "rt") as fh:
+            content = fh.read()
+
+            # It succeeded
+            m = re.search(timer_regex, content)
+            if m:
+                return float(m.group(1))
+            else:
+                return None
+    else:
+        return None
+
+
 ScorerFunc = Callable[[str], Optional[bool]]
+TimerFunc = Callable[[str], Optional[float]]
 
 
 def default_tabulate(
-    args: List[str], scorer: ScorerFunc = default_scorer, exclude_dir_names: List[str] = EXCLUDE_DIR_NAMES
-) -> Tuple[argparse.Namespace, List[List[Any]]]:
+    args: List[str],
+    scorer: ScorerFunc = default_scorer,
+    timer: TimerFunc = default_timer,
+    exclude_dir_names: List[str] = EXCLUDE_DIR_NAMES,
+) -> None:
     invocation_cmd = args[0]
     args = args[1:]
 
@@ -119,7 +142,7 @@ def default_tabulate(
     parsed_args = parser.parse_args(args)
     runlogs: str = parsed_args.runlogs
 
-    all_results: List[List[Any]] = list()
+    all_results: List[Dict[str, Any]] = list()
     max_instances = 0
 
     for task_id in sorted(
@@ -135,116 +158,101 @@ def default_tabulate(
             continue
 
         # Collect the results vector
-        results: List[Any] = [task_id]
+        results: Dict[str, Any] = {"Task Id": task_id}
 
-        instance = 0
-        instance_dir = os.path.join(task_path, str(instance))
-        while os.path.isdir(instance_dir):
-            results.append(scorer(instance_dir))
-            instance += 1
+        # Collect the results for each instance.
+        instance_dirs = sorted(
+            os.listdir(task_path),
+            key=lambda s: os.path.getmtime(os.path.join(task_path, s)),
+        )
+        instances = [int(d) for d in instance_dirs if d.isdigit()]
+
+        for instance in instances:
             instance_dir = os.path.join(task_path, str(instance))
+            results[f"Trial {instance} Success"] = scorer(instance_dir)
+            results[f"Trial {instance} Time"] = timer(instance_dir)
 
-        max_instances = max(max_instances, instance)
+        max_instances = max(instances)
 
         # Buffer the results
        all_results.append(results)
 
+    num_instances = max_instances + 1
+
+    # Pad the results to max_instances
+    for result in all_results:
+        for i in range(num_instances):
+            if f"Trial {i} Success" not in result:
+                result[f"Trial {i} Success"] = None
+            if f"Trial {i} Time" not in result:
+                result[f"Trial {i} Time"] = None
+
+    # Create dataframe from results.
+    df = pd.DataFrame(all_results)
+
     if parsed_args.csv:
-        # Create a header
-        header = ["Task Id"]
-        for i in range(0, max_instances):
-            header.append("Trial " + str(i) + " Success")
-
-        print(",".join(header))
-        for row in all_results:
-            str_row = [f"{v}" if v is not None else "" for v in row]
-            while len(str_row) < max_instances + 1:
-                str_row.append("")
-            print(",".join(str_row))
-
+        # Print out the dataframe in CSV format
+        print(df.to_csv(index=False))
         # Print out alpha-version warning
         sys.stderr.write("\n" + warning + "\n\n")
     else:
-        # Create a header
-        header = ["\nTask Id"]
-        for i in range(0, max_instances):
-            header.append("Trial " + str(i) + "\nSuccess")
+        # Tabulate the results.
+        print(tb.tabulate(df, headers="keys", tablefmt="simple"))  # type: ignore
 
-        # Create the footer
-        def _count_equals(value: Optional[bool], trial: int) -> int:
-            count = 0
-            for row in all_results:
-                is_answer_matched = row[trial + 1][0] if isinstance(row[trial + 1], tuple) else row[trial + 1]
+        # Aggregate statistics for all tasks for each trials.
+        print("\nSummary Statistics\n")
+        score_columns = ["Trial " + str(i) + " Success" for i in range(num_instances)]
+        # Count the number of successes when the value is True.
+        successes = df[score_columns].apply(lambda x: x is True).sum(axis=0)  # type: ignore
+        # Count the number of failures when the value is False.
+        failures: pd.Series = df[score_columns].apply(lambda x: x is False).sum(axis=0)  # type: ignore
+        # Count the number of missing
+        missings = df[score_columns].isna().sum(axis=0)  # type: ignore
+        # Count the total number of instances
+        totals = successes + failures + missings  # type: ignore
+        # Calculate the average success rates
+        avg_success_rates = successes / (successes + failures)  # type: ignore
+        time_columns = ["Trial " + str(i) + " Time" for i in range(num_instances)]  # type: ignore
+        # Count the total time of non-null values
+        total_times = df[time_columns].sum(axis=0, skipna=True)  # type: ignore
+        # Calculate the average time of non-null values
+        avg_times = df[time_columns].mean(axis=0, skipna=True)  # type: ignore
 
-                # Count missing
-                if value is None:
-                    if trial + 1 < len(row):
-                        if is_answer_matched is None:
-                            count += 1
-                    else:
-                        count += 1
-                # Count match
-                elif trial + 1 < len(row) and is_answer_matched == value:
-                    count += 1
-            return count
+        # Create a per-trial summary dataframe
+        trial_df = pd.DataFrame(
+            {
+                "Successes": list(successes),  # type: ignore
+                "Failures": list(failures),  # type: ignore
+                "Missing": list(missings),  # type: ignore
+                "Total": list(totals),  # type: ignore
+                "Average Success Rate": list(avg_success_rates),  # type: ignore
+                "Average Time": list(avg_times),  # type: ignore
+                "Total Time": list(total_times),  # type: ignore
+            },
+            index=[f"Trial {i}" for i in range(num_instances)],
+        )
+        # Print out the per-trial summary dataframe.
+        print(tb.tabulate(trial_df, headers="keys", tablefmt="simple"))  # type: ignore
 
-        footer: List[Any] = []
-        footer_row: List[Any] = ["Successes"]
-        for i in range(0, max_instances):
-            footer_row.append(_count_equals(True, i))
-        footer.append(footer_row)
+        # Aggregate statistics across tasks for all trials.
+        # At least one success for each trial, averaged across tasks.
+        average_at_least_one_success = df[score_columns].any(axis=1).mean(skipna=True)  # type: ignore
+        # All successes for each trial
+        average_all_successes = df[score_columns].all(axis=1).mean(skipna=True)  # type: ignore
 
-        footer_row = ["Failures"]
-        for i in range(0, max_instances):
-            # count how many are not True, and not None, could be False or any other value
-            failures = 0
-            for row in all_results:
-                if isinstance(row[i + 1], tuple):
-                    failures += row[i + 1][0] not in [1, None]
-                else:
-                    failures += row[i + 1] not in [1, None]
-            footer_row.append(failures)
-        footer.append(footer_row)
-
-        footer_row = ["Missing"]
-        for i in range(0, max_instances):
-            footer_row.append(_count_equals(None, i))
-        footer.append(footer_row)
-
-        footer_row = ["Total"]
-        for i in range(0, max_instances):
-            footer_row.append(footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1])
-        footer.append(footer_row)
-
-        footer_row = ["Average Success Rate"]
-        for i in range(0, max_instances):
-            footer_row.append(_count_equals(True, i) / (footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1]))
-        footer.append(footer_row)
-
-        footer_row = ["Average Score"]
-        for i in range(0, max_instances):
-            avg_score_trial = 0.0
-            for row in all_results:
-                if isinstance(row[i + 1], tuple):
-                    avg_score_trial += row[i + 1][0]
-            avg_score_trial = avg_score_trial / len(all_results)
-            footer_row.append(avg_score_trial)
-        footer.append(footer_row)
-
-        table = deepcopy(all_results)
-        for row in table:
-            for trial in range(0, max_instances):
-                if isinstance(row[trial + 1], tuple):
-                    row[trial + 1] = row[trial + 1][0]
-
-        table.append(tb.SEPARATING_LINE)  # type: ignore
-        table.extend(footer)
-
-        print(tb.tabulate(table, headers=header))
+        # Create a dataframe
+        trial_aggregated_df = pd.DataFrame(
+            {
+                "At Least One Success": [average_at_least_one_success],  # type: ignore
+                "All Successes": [average_all_successes],  # type: ignore
+            },
+            index=["Trial Aggregated"],
+        )
+        # Print out the trial-aggregated dataframe.
+        print(tb.tabulate(trial_aggregated_df, headers="keys", tablefmt="simple"))  # type: ignore
 
         # Print out alpha-version warning
         sys.stderr.write("\n" + warning + "\n\n")
-    return parsed_args, all_results
 
 
 def tabulate_cli(args: Sequence[str]) -> None: