model_evaluation.py
import os

import fire
import mysql.connector
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, T5ForConditionalGeneration
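
# Example invocation through fire's command-line wrapper (a sketch; the model,
# tokenizer, and dataset paths below are placeholders, not values taken from
# this script):
#
#   python model_evaluation.py \
#       --model_name=t5-small \
#       --tokenizer_path=t5-small \
#       --dataset_path=data/test_seen.csv \
#       --output_path=Model_outputs
#
# The dataset CSV must provide "Text" (natural-language question) and "SQL"
# (reference query) columns, which is what the code below reads.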
def main(
    model_name: str,
    tokenizer_path: str,
    dataset_path: str,
    output_path: str = "Model_outputs",
    input_max: int = 64,
    output_max: int = 256,
):
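    """Evaluate a T5 text-to-SQL model with execution-based metrics.

    Loads the model and tokenizer, prefixes each question with
    "Translate to SQL: ", generates a query for every example, executes both
    the reference and the predicted SQL against the imdb_ijs database, and
    writes failures and mismatches to CSV files under ``output_path``.
    """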
    parent_directory = output_path

    # Run on the GPU when one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
    model.eval()

    # Shuffle the evaluation set and prepend the task prefix used during fine-tuning.
    df = pd.read_csv(dataset_path)
    df = df.sample(frac=1).reset_index(drop=True)
    df["Text"] = "Translate to SQL: " + df["Text"]

    test_set_seen = Dataset.from_pandas(df)
    test_set_seen.set_format(type="torch")

    # Sanity check: print one example pair.
    print(test_set_seen["SQL"][1])
    print(test_set_seen["Text"][1])
    def convert_to_features(example_batch, padding="max_length", input_max=input_max, output_max=output_max):
        inputs = tokenizer(example_batch["Text"], max_length=input_max, padding=padding, truncation=True)
        targets = tokenizer(example_batch["SQL"], max_length=output_max, padding=padding, truncation=True)
        # Mask pad tokens in the labels with -100 so they can be restored to
        # pad ids before decoding the reference SQL.
        if padding == "max_length":
            targets["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in target]
                for target in targets["input_ids"]
            ]
        inputs["labels"] = targets["input_ids"]
        return inputs
    def evaluate_peft_model(sample):
        # Greedy generation; top_p only takes effect when do_sample=True.
        outputs = model.generate(input_ids=sample["input_ids"].unsqueeze(0).to(device), max_length=200, top_p=0.9)
        prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)
        # Replace the -100 loss mask with pad ids before decoding the reference.
        label = np.where(sample["labels"] != -100, sample["labels"], tokenizer.pad_token_id)
        label = tokenizer.decode(label, skip_special_tokens=True)
        execution_accuracy(prediction, label)
    def execution_accuracy(prediction, label):
        # Only examples whose reference query executes count toward the metrics.
        try:
            cursor.execute(label)
            result_label = cursor.fetchall()
            all_executions_overall.append(1)
            try:
                cursor.execute(prediction)
                result_pred = cursor.fetchall()
                all_executions_accuracy.append(1)
                # For large result sets, compare only row counts as a cheap
                # proxy; otherwise require identical rows.
                if len(result_label) > 10:
                    if len(result_label) == len(result_pred):
                        accurate_executions.append(1)
                elif result_label == result_pred:
                    accurate_executions.append(1)
                else:
                    for_checking_label.append(label)
                    for_checking_prediction.append(prediction)
            except mysql.connector.Error:
                failed_executions.append(1)
                failed_predicted_SQL.append(prediction)
        except mysql.connector.Error:
            failed_label_SQL.append(label)
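
    # Note: the two helpers above close over `cursor` and the metric lists that
    # are created below; Python resolves these names when the functions are
    # called, so defining them afterwards is fine.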
    # Connect to the public IMDb (imdb_ijs) research database that the queries
    # are executed against.
    connection = mysql.connector.connect(
        host="relational.fit.cvut.cz",
        user="guest",
        password="relational",
        database="imdb_ijs",
    )
    cursor = connection.cursor()
print("mapping both datasets")
tokenized_dataset = test_set_seen.map(convert_to_features, batched=True, num_proc=4)
print("mapped both dataset")
print("Document we have: tokenized_dataset for seen data")
print("\n\n Running executions for seen dataset")
    # Counters and buckets for the execution-based metrics.
    all_executions_overall = []   # reference queries that executed
    failed_executions = []        # predicted queries that failed to execute
    all_executions_accuracy = []  # predicted queries that executed
    accurate_executions = []      # predictions whose results matched the reference
    for_checking_label = []       # mismatched pairs kept for manual inspection
    for_checking_prediction = []
    failed_label_SQL = []         # reference queries that failed to execute
    failed_predicted_SQL = []     # predicted queries that failed to execute

    with tqdm(total=len(tokenized_dataset), ncols=100, ascii=True) as pbar:
        for sample in tokenized_dataset:
            evaluate_peft_model(sample)
            # Show the running count of accurate executions, not the list itself.
            pbar.set_postfix({"Ac/Ex": len(accurate_executions)})
            pbar.update()
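
    # Metric definitions, as computed from the counters above:
    #   Execution rate:     predictions that executed / references that executed.
    #   Execution accuracy: matching results / predictions that executed.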
print("All SQL runs: ", len(all_executions_overall))
print("Model SQLs that failed: ", len(failed_executions))
print(f"Execution rate: {len(all_executions_accuracy)/len(all_executions_overall)*100}%")
print(f"Execution rate: {100 - len(failed_executions)/len(all_executions_overall)*100}%")
print(f"Execution accuracy: {len(accurate_executions)/len(all_executions_accuracy)*100}%")
    # Persist failures and mismatches for later inspection.
    failed_label_sql_df = pd.DataFrame(failed_label_SQL)
    failed_predicted_sql_df = pd.DataFrame(failed_predicted_SQL)
    not_equals = pd.DataFrame({
        "Label": for_checking_label,
        "Prediction": for_checking_prediction,
    })

    os.makedirs(parent_directory, exist_ok=True)
    not_equals.to_csv(os.path.join(parent_directory, model_name + "_Not_accurate.csv"), index=False)
    failed_label_sql_df.to_csv(os.path.join(parent_directory, model_name + "_Failed_labels.csv"), index=False)
    failed_predicted_sql_df.to_csv(os.path.join(parent_directory, model_name + "_Failed_predicted.csv"), index=False)
if __name__ == "__main__":
    fire.Fire(main)
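
# Note: the connection in main() targets relational.fit.cvut.cz, a public
# collection of research databases, using its guest login. Any MySQL server
# hosting the imdb_ijs schema can be substituted via the connect() arguments.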