concurrently run scenario repetitions in contest evaluation.
PiperOrigin-RevId: 680969243
Change-Id: Ide62146ee6eaeede1caffa0b4d6be68040ac960d
jzleibo authored and copybara-github committed Oct 1, 2024
1 parent bf2e2a0 commit 40e3f96
Showing 1 changed file with 58 additions and 22 deletions.
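In outline, the commit moves the body of the per-scenario repetition loop into a standalone helper, _evaluate_one_repetition, and then fans the repetitions out as concurrent background tasks keyed by repetition index, as the diff below shows. Here is a minimal sketch of the fan-out pattern it relies on, assuming each task is a zero-argument callable; this helper is a concurrent.futures stand-in for illustration, not the actual implementation of Concordia's concurrency.run_tasks_in_background:

# Sketch only: an illustrative stand-in for concurrency.run_tasks_in_background
# as called in the diff below. Assumes each task is a zero-argument callable.
import concurrent.futures
from collections.abc import Callable, Mapping
from typing import TypeVar

_K = TypeVar('_K')
_V = TypeVar('_V')


def run_tasks_in_background(
    tasks: Mapping[_K, Callable[[], _V]],
) -> tuple[dict[_K, _V], dict[_K, Exception]]:
  """Runs tasks concurrently and separates successes from failures."""
  outputs: dict[_K, _V] = {}
  exceptions: dict[_K, Exception] = {}
  with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {key: executor.submit(task) for key, task in tasks.items()}
    for key, future in futures.items():
      try:
        outputs[key] = future.result()
      except Exception as e:  # Collect failures rather than failing fast.
        exceptions[key] = e
  return outputs, exceptions

Returning both dicts lets the caller pair every outcome with its repetition index and decide how to surface failures; the launcher raises them together as one ExceptionGroup. The new helper also writes each repetition's HTML log to its own file (scenario name plus repetition index), so concurrent repetitions no longer share a single per-scenario log.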
examples/modular/launch_concordia_challenge_evaluation.py
@@ -175,6 +175,43 @@
os.makedirs(results_dir, exist_ok=True)


+def _evaluate_one_repetition(
+    scenario_name: str,
+    scenario_config: scenarios_lib.ScenarioConfig,
+    repetition_idx: int,
+) -> logging_lib.SimulationOutcome:
+  """Evaluates the agent on one scenario, one repetition.
+
+  Args:
+    scenario_name: name of the scenario
+    scenario_config: config for the scenario
+    repetition_idx: index of the repetition
+
+  Returns:
+    SimulationOutcome object with the results of the evaluation.
+  """
+  measurements = measurements_lib.Measurements()
+  runnable_simulation = scenarios_lib.build_simulation(
+      scenario_config=scenario_config,
+      model=model,
+      focal_agent_module=agent_module,
+      embedder=embedder,
+      measurements=measurements,
+      override_agent_model=call_limit_wrapper.CallLimitLanguageModel(model),
+  )
+  # Run the simulation.
+  outcome, text_results_log = runnable_simulation()
+  # Write the full text log as an HTML file in the results directory.
+  html_filename = (
+      f'{results_dir}/{scenario_name}__{repetition_idx}__'
+      + datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
+      + '.html'
+  )
+  with open(html_filename, 'a', encoding='utf-8') as f:
+    f.write(text_results_log)
+  return outcome


def _evaluate_all_repetitions_on_one_scenario(
    scenario_name: str,
    scenario_config: scenarios_lib.ScenarioConfig,
@@ -186,26 +223,33 @@ def _evaluate_all_repetitions_on_one_scenario(
    scenario_config: config for the scenario

  Returns:
    ScenarioResult object with the results of the evaluation.
+
+  Raises:
+    ExceptionGroup: if any of the repetitions raised an exception.
  """
  print(f'Running scenario: {scenario_name}')
  # Run several simulations per scenario
  simulation_outcomes = []
  focal_per_capita_scores_to_average = []
  background_per_capita_scores_to_average = []
  ungrouped_per_capita_scores_to_average = []
-  for _ in range(args.num_repetitions_per_scenario):
-    measurements = measurements_lib.Measurements()
-    runnable_simulation = scenarios_lib.build_simulation(
-        scenario_config=scenario_config,
-        model=model,
-        focal_agent_module=agent_module,
-        embedder=embedder,
-        measurements=measurements,
-        override_agent_model=call_limit_wrapper.CallLimitLanguageModel(model),
-    )
-    # Run the simulation
-    outcome, text_results_log = runnable_simulation()
-    simulation_outcomes.append(outcome)

+  tasks_this_scenario = {
+      i: functools.partial(
+          _evaluate_one_repetition,
+          scenario_name=scenario_name,
+          scenario_config=scenario_config,
+          repetition_idx=i,
+      )
+      for i in range(args.num_repetitions_per_scenario)
+  }
+  outputs_per_repetition, exceptions_per_repetition = (
+      concurrency.run_tasks_in_background(
+          tasks_this_scenario,
+      )
+  )
+  if exceptions_per_repetition:
+    raise ExceptionGroup(
+        'Raised errors', list(exceptions_per_repetition.values())
+    )
+
+  for repetition_idx, outcome in outputs_per_repetition.items():
    if scenario_config.focal_is_resident:
      focal_scores = list(outcome.resident_scores.values())
      background_scores = list(outcome.visitor_scores.values())
@@ -215,7 +259,7 @@ def _evaluate_all_repetitions_on_one_scenario(
    # Ungrouped scores do not differentiate between focal and background.
    ungrouped_scores = focal_scores + background_scores
    # Calculate per capita scores.
-    print('\nScores:')
+    print(f'\nScores for repetition {repetition_idx}:')
    focal_per_capita_score = np.mean(focal_scores)
    focal_per_capita_scores_to_average.append(focal_per_capita_score)
    print(f' Focal per capita score: {focal_per_capita_score}')
@@ -225,14 +269,6 @@ def _evaluate_all_repetitions_on_one_scenario(
    ungrouped_per_capita_score = np.mean(ungrouped_scores)
    ungrouped_per_capita_scores_to_average.append(ungrouped_per_capita_score)
    print(f' Ungrouped per capita score: {ungrouped_per_capita_score}')
-    # Write the full text log as an HTML file in the current working directory.
-    html_filename = (
-        f'{results_dir}/{scenario_name}_'
-        + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
-        + '.html'
-    )
-    with open(html_filename, 'a', encoding='utf-8') as f:
-      f.write(text_results_log)

  # Average scores over repetitions and save results for all repetitions in a
  # json-serializable format.
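A note on the new error path, not stated in the commit: ExceptionGroup is a builtin only from Python 3.11 onward (earlier interpreters need the exceptiongroup backport from PyPI), and callers can unpack per-repetition failures with except*. A hypothetical, self-contained example; evaluate_scenario is a dummy standing in for a call to _evaluate_all_repetitions_on_one_scenario:

# Requires Python 3.11+. `evaluate_scenario` is a hypothetical stand-in that
# fails the way _evaluate_all_repetitions_on_one_scenario would.
def evaluate_scenario():
  raise ExceptionGroup('Raised errors', [ValueError('bad repetition')])

try:
  result = evaluate_scenario()
except* ValueError as group:
  # group.exceptions holds only the ValueErrors from the group; any other
  # exception types would propagate as a remaining ExceptionGroup.
  for err in group.exceptions:
    print(f'Repetition failed: {err!r}')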
