From b916ceff7711c596326a78e0946640d2f122d9e2 Mon Sep 17 00:00:00 2001 From: Felix Ocker Date: Mon, 19 Aug 2024 15:55:37 +0200 Subject: [PATCH] fix(log_analysis): use SD instead of VAR, default settings to reduced type --- src/eval/math_eval/log_analysis.py | 27 +++++++++++++--------- src/eval/math_eval/math_eval_settings.yaml | 6 ++--- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/eval/math_eval/log_analysis.py b/src/eval/math_eval/log_analysis.py index ba2718b..63a158d 100644 --- a/src/eval/math_eval/log_analysis.py +++ b/src/eval/math_eval/log_analysis.py @@ -28,13 +28,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # import importlib +import itertools import json import logging.config import os import re import shutil import statistics -import itertools from collections import Counter from dataclasses import dataclass from datetime import datetime @@ -156,17 +156,22 @@ def do_significance_test(all_results): # two-sided test, H0: no difference test = scipy.stats.wilcoxon(pairs_x, pairs_y) - print(f"'{agents[0]}' - '{agents[1]}': no difference, H0 is dropped: {test.pvalue = }") + print( + f"'{agents[0]}' - '{agents[1]}': no difference, H0 is dropped: {test.pvalue = }" + ) # one-sided less, H0 y greater than x, H1 y less than x test = scipy.stats.wilcoxon(pairs_x, pairs_y, alternative="less") - print(f"'{agents[0]}' - '{agents[1]}': second is greater H0 dropped for lesser H1: {test.pvalue = }") + print( + f"'{agents[0]}' - '{agents[1]}': second is greater H0 dropped for lesser H1: {test.pvalue = }" + ) # one-sided less, H0 y lesser than x, H1 y greater than x test = scipy.stats.wilcoxon(pairs_x, pairs_y, alternative="greater") - print(f"'{agents[0]}' - '{agents[1]}': second is lesser H0 dropped for great H1: {test.pvalue = }") + print( + f"'{agents[0]}' - '{agents[1]}': second is lesser H0 dropped for great H1: {test.pvalue = }" + ) print() - def interquartile_mean(values: list) -> float: lnv = len(values) q = lnv // 4 @@ -261,7 +266,7 @@ def extract_data_from_log( r = Result( agent=agent, task=query, - task_id='None', + task_id="None", run=run, model=model, embedding_model=embedding_model, @@ -423,7 +428,7 @@ def plot( variation = [scipy.stats.iqr(s) for s in scores] else: variation = [ - np.var([statistics.mean(s) for s in scores_by_level]) + np.std([statistics.mean(s) for s in scores_by_level]) for scores_by_level in scores_by_run ] number_of_scores = [len(e) for e in scores] @@ -477,7 +482,7 @@ def plot( tight = (0, 0, 1, 0.83) plt.tight_layout(rect=tight) - # plt.savefig(output_file, bbox_inches="tight", dpi=300) + plt.savefig(output_file, bbox_inches="tight", dpi=300) plt.show() return result_dict @@ -676,9 +681,9 @@ def sanity_check_results( # "logs/math.eval.20240808-0858.log", # gpt4 turbo, lvl 4 # "logs/math.eval.20240809-0848.log", # gpt4 turbo, lvl 5 # "logs/math.eval.20240812-1339.log", # gpt4omin, full lib, lvl 1-3 - "logs/math.eval.20240619-1357.log", # gpt-3.5-turbo, our math, 5 runs + # "logs/math.eval.20240619-1357.log", # gpt-3.5-turbo, our math, 5 runs ] - history_file = "history_eval" # to use different history files + history_file = "history" # to use different history files with open("math_eval_settings.yaml", "rt") as mes: settings = yaml.safe_load(mes.read()) @@ -713,7 +718,7 @@ def sanity_check_results( a for a in history_data[log_name]["agents"] if history_data[log_name]["agents"][a] - and a != "BaseAgent" # to exclude base agent in plots + # and a != "BaseAgent" # to exclude base agent in plots ] colors = [history_data[log_name]["colors"][a] for a in agents] number_of_runs = history_data[log_name]["number_of_runs"] diff --git a/src/eval/math_eval/math_eval_settings.yaml b/src/eval/math_eval/math_eval_settings.yaml index 088f5cd..b153374 100644 --- a/src/eval/math_eval/math_eval_settings.yaml +++ b/src/eval/math_eval/math_eval_settings.yaml @@ -1,12 +1,12 @@ # Settings for math eval -benchmark_type: math +benchmark_type: reduced ground_truth: math_tasks.json number_of_runs: 1 log_folder: logs log_file: null task_filter: null -run_sanity_checks: false +run_sanity_checks: true plot_cost_distribution: false tools: math_tools @@ -39,4 +39,4 @@ colors: InformedCotTulipAgent: "#9900ff" PrimedCotTulipAgent: "#cc66ff" OneShotCotTulipAgent: "#cc33ff" - AutoTulipAgent: "#660066" \ No newline at end of file + AutoTulipAgent: "#660066"