From b916ceff7711c596326a78e0946640d2f122d9e2 Mon Sep 17 00:00:00 2001
From: Felix Ocker <felix.ocker@honda-ri.de>
Date: Mon, 19 Aug 2024 15:55:37 +0200
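Subject: [PATCH] fix(log_analysis): use SD instead of VAR, default settings to
 reduced type

log_analysis.py:
* plot: compute the variation of the per-run means with np.std instead
  of np.var
* plot: re-enable plt.savefig(output_file, ...)
* set the default history_file to "history" and comment out the
  gpt-3.5-turbo log entry
* no longer exclude BaseAgent when collecting the agents to plot
* formatting only: import order, wrapped print calls, quote style

math_eval_settings.yaml:
* benchmark_type: math -> reduced
* run_sanity_checks: false -> true
* add the missing newline at end of file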
---
 src/eval/math_eval/log_analysis.py         | 27 +++++++++++++---------
 src/eval/math_eval/math_eval_settings.yaml |  6 ++---
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/eval/math_eval/log_analysis.py b/src/eval/math_eval/log_analysis.py
index ba2718b..63a158d 100644
--- a/src/eval/math_eval/log_analysis.py
+++ b/src/eval/math_eval/log_analysis.py
@@ -28,13 +28,13 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 import importlib
+import itertools
 import json
 import logging.config
 import os
 import re
 import shutil
 import statistics
-import itertools
 from collections import Counter
 from dataclasses import dataclass
 from datetime import datetime
@@ -156,17 +156,22 @@ def do_significance_test(all_results):
 
         # two-sided test, H0: no difference
         test = scipy.stats.wilcoxon(pairs_x, pairs_y)
-        print(f"'{agents[0]}' - '{agents[1]}': no difference, H0 is dropped: {test.pvalue = }")
+        print(
+            f"'{agents[0]}' - '{agents[1]}': no difference, H0 is dropped: {test.pvalue = }"
+        )
         # one-sided less, H0 y greater than x, H1 y less than x
         test = scipy.stats.wilcoxon(pairs_x, pairs_y, alternative="less")
-        print(f"'{agents[0]}' - '{agents[1]}': second is greater H0 dropped for lesser H1: {test.pvalue = }")
+        print(
+            f"'{agents[0]}' - '{agents[1]}': second is greater H0 dropped for lesser H1: {test.pvalue = }"
+        )
         # one-sided less, H0 y lesser than x, H1 y greater than x
         test = scipy.stats.wilcoxon(pairs_x, pairs_y, alternative="greater")
-        print(f"'{agents[0]}' - '{agents[1]}': second is lesser H0 dropped for great H1: {test.pvalue = }")
+        print(
+            f"'{agents[0]}' - '{agents[1]}': second is lesser H0 dropped for great H1: {test.pvalue = }"
+        )
         print()
 
 
-
 def interquartile_mean(values: list) -> float:
     lnv = len(values)
     q = lnv // 4
@@ -261,7 +266,7 @@ def extract_data_from_log(
         r = Result(
             agent=agent,
             task=query,
-            task_id='None',
+            task_id="None",
             run=run,
             model=model,
             embedding_model=embedding_model,
@@ -423,7 +428,7 @@ def plot(
                 variation = [scipy.stats.iqr(s) for s in scores]
             else:
                 variation = [
-                    np.var([statistics.mean(s) for s in scores_by_level])
+                    np.std([statistics.mean(s) for s in scores_by_level])
                     for scores_by_level in scores_by_run
                 ]
             number_of_scores = [len(e) for e in scores]
@@ -477,7 +482,7 @@ def plot(
 
     tight = (0, 0, 1, 0.83)
     plt.tight_layout(rect=tight)
-    # plt.savefig(output_file, bbox_inches="tight", dpi=300)
+    plt.savefig(output_file, bbox_inches="tight", dpi=300)
     plt.show()
     return result_dict
 
@@ -676,9 +681,9 @@ def sanity_check_results(
         # "logs/math.eval.20240808-0858.log",  # gpt4 turbo, lvl 4
         # "logs/math.eval.20240809-0848.log",  # gpt4 turbo, lvl 5
         # "logs/math.eval.20240812-1339.log",  # gpt4omin, full lib, lvl 1-3
-        "logs/math.eval.20240619-1357.log", # gpt-3.5-turbo, our math, 5 runs
+        # "logs/math.eval.20240619-1357.log",  # gpt-3.5-turbo, our math, 5 runs
     ]
-    history_file = "history_eval" # to use different history files
+    history_file = "history"  # to use different history files
 
     with open("math_eval_settings.yaml", "rt") as mes:
         settings = yaml.safe_load(mes.read())
@@ -713,7 +718,7 @@ def sanity_check_results(
                 a
                 for a in history_data[log_name]["agents"]
                 if history_data[log_name]["agents"][a]
-                and a != "BaseAgent"  # to exclude base agent in plots
+                # and a != "BaseAgent"  # to exclude base agent in plots
             ]
             colors = [history_data[log_name]["colors"][a] for a in agents]
             number_of_runs = history_data[log_name]["number_of_runs"]
diff --git a/src/eval/math_eval/math_eval_settings.yaml b/src/eval/math_eval/math_eval_settings.yaml
index 088f5cd..b153374 100644
--- a/src/eval/math_eval/math_eval_settings.yaml
+++ b/src/eval/math_eval/math_eval_settings.yaml
@@ -1,12 +1,12 @@
 # Settings for math eval
 
-benchmark_type: math
+benchmark_type: reduced
 ground_truth: math_tasks.json
 number_of_runs: 1
 log_folder: logs
 log_file: null
 task_filter: null
-run_sanity_checks: false
+run_sanity_checks: true
 plot_cost_distribution: false
 
 tools: math_tools
@@ -39,4 +39,4 @@ colors:
     InformedCotTulipAgent: "#9900ff"
     PrimedCotTulipAgent: "#cc66ff"
     OneShotCotTulipAgent: "#cc33ff"
-    AutoTulipAgent: "#660066"
\ No newline at end of file
+    AutoTulipAgent: "#660066"