Skip to content

Commit

Permalink
fix(log_analysis): use SD instead of VAR, default settings to reduced…
Browse files Browse the repository at this point in the history
… type
  • Loading branch information
felixocker committed Aug 19, 2024
1 parent b7c0ecb commit b916cef
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 14 deletions.
27 changes: 16 additions & 11 deletions src/eval/math_eval/log_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
import importlib
import itertools
import json
import logging.config
import os
import re
import shutil
import statistics
import itertools
from collections import Counter
from dataclasses import dataclass
from datetime import datetime
Expand Down Expand Up @@ -156,17 +156,22 @@ def do_significance_test(all_results):

# two-sided test, H0: no difference
test = scipy.stats.wilcoxon(pairs_x, pairs_y)
print(f"'{agents[0]}' - '{agents[1]}': no difference, H0 is dropped: {test.pvalue = }")
print(
f"'{agents[0]}' - '{agents[1]}': no difference, H0 is dropped: {test.pvalue = }"
)
# one-sided less, H0 y greater than x, H1 y less than x
test = scipy.stats.wilcoxon(pairs_x, pairs_y, alternative="less")
print(f"'{agents[0]}' - '{agents[1]}': second is greater H0 dropped for lesser H1: {test.pvalue = }")
print(
f"'{agents[0]}' - '{agents[1]}': second is greater H0 dropped for lesser H1: {test.pvalue = }"
)
# one-sided greater, H0 y lesser than x, H1 y greater than x
test = scipy.stats.wilcoxon(pairs_x, pairs_y, alternative="greater")
print(f"'{agents[0]}' - '{agents[1]}': second is lesser H0 dropped for great H1: {test.pvalue = }")
print(
f"'{agents[0]}' - '{agents[1]}': second is lesser H0 dropped for great H1: {test.pvalue = }"
)
print()



def interquartile_mean(values: list) -> float:
lnv = len(values)
q = lnv // 4
Expand Down Expand Up @@ -261,7 +266,7 @@ def extract_data_from_log(
r = Result(
agent=agent,
task=query,
task_id='None',
task_id="None",
run=run,
model=model,
embedding_model=embedding_model,
Expand Down Expand Up @@ -423,7 +428,7 @@ def plot(
variation = [scipy.stats.iqr(s) for s in scores]
else:
variation = [
np.var([statistics.mean(s) for s in scores_by_level])
np.std([statistics.mean(s) for s in scores_by_level])
for scores_by_level in scores_by_run
]
number_of_scores = [len(e) for e in scores]
Expand Down Expand Up @@ -477,7 +482,7 @@ def plot(

tight = (0, 0, 1, 0.83)
plt.tight_layout(rect=tight)
# plt.savefig(output_file, bbox_inches="tight", dpi=300)
plt.savefig(output_file, bbox_inches="tight", dpi=300)
plt.show()
return result_dict

Expand Down Expand Up @@ -676,9 +681,9 @@ def sanity_check_results(
# "logs/math.eval.20240808-0858.log", # gpt4 turbo, lvl 4
# "logs/math.eval.20240809-0848.log", # gpt4 turbo, lvl 5
# "logs/math.eval.20240812-1339.log", # gpt4omin, full lib, lvl 1-3
"logs/math.eval.20240619-1357.log", # gpt-3.5-turbo, our math, 5 runs
# "logs/math.eval.20240619-1357.log", # gpt-3.5-turbo, our math, 5 runs
]
history_file = "history_eval" # to use different history files
history_file = "history" # to use different history files

with open("math_eval_settings.yaml", "rt") as mes:
settings = yaml.safe_load(mes.read())
Expand Down Expand Up @@ -713,7 +718,7 @@ def sanity_check_results(
a
for a in history_data[log_name]["agents"]
if history_data[log_name]["agents"][a]
and a != "BaseAgent" # to exclude base agent in plots
# and a != "BaseAgent" # to exclude base agent in plots
]
colors = [history_data[log_name]["colors"][a] for a in agents]
number_of_runs = history_data[log_name]["number_of_runs"]
Expand Down
6 changes: 3 additions & 3 deletions src/eval/math_eval/math_eval_settings.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Settings for math eval

benchmark_type: math
benchmark_type: reduced
ground_truth: math_tasks.json
number_of_runs: 1
log_folder: logs
log_file: null
task_filter: null
run_sanity_checks: false
run_sanity_checks: true
plot_cost_distribution: false

tools: math_tools
Expand Down Expand Up @@ -39,4 +39,4 @@ colors:
InformedCotTulipAgent: "#9900ff"
PrimedCotTulipAgent: "#cc66ff"
OneShotCotTulipAgent: "#cc33ff"
AutoTulipAgent: "#660066"
AutoTulipAgent: "#660066"

0 comments on commit b916cef

Please sign in to comment.