# -*- coding: utf-8 -*-
"""
Created on Sun Nov 27 13:03:26 2016
@author: davidovitch
"""
import os
from os.path import join as pjoin
from glob import glob
import shutil
import re
import gc
import hashlib
import json
from io import StringIO
from lxml.html import fromstring
from lxml import (etree, objectify)
import urllib.request
import urllib.parse
from tqdm import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
class DataFrameDict:
"""Utilities for handling and checking the consistancy of a dictionary
in DataFrame format
"""
@staticmethod
def check_column_length(df_dict):
"""Make sure all columns have the same number of rows
"""
collens = {}
for col, values in df_dict.items():
collens[col] = len(values)
if len(set(collens.values())) > 1:
for col, val in collens.items():
print('%6i : %s' % (val, col))
return collens
@staticmethod
def trim_cell_len(df_dict, maxlens):
"""Trim maximum length of a certain columns/cells
"""
col0 = list(df_dict.keys())[0]
for i in range(len(df_dict[col0])):
for col, maxlen in maxlens.items():
if isinstance(df_dict[col][i], str):
df_dict[col][i] = df_dict[col][i][:maxlen]
return df_dict
@staticmethod
def check_cell_len(df_dict):
"""List maximum cell length per column
"""
maxcellen, celltype = {}, {}
for name, column in df_dict.items():
maxcellen[name], celltype[name] = [], []
for cell in column:
celltype[name].append(str(type(cell)))
if isinstance(cell, str):
maxcellen[name].append(len(cell))
else:
maxcellen[name].append(0)
return maxcellen, celltype
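# A minimal usage sketch of the DataFrameDict helpers (hypothetical data, not
# taken from the source); check_column_length reports columns of unequal
# length, trim_cell_len caps the string length of selected columns:
#
#   df_dict = {'Processor': ['Intel Core i7-6700K', 'AMD FX-8350'],
#              'Memory': ['16384MB', '8192MB']}
#   DataFrameDict.check_column_length(df_dict)               # lengths are consistent
#   DataFrameDict.trim_cell_len(df_dict, {'Processor': 8})   # -> 'Intel Co', 'AMD FX-8'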
class OpenBenchMarking:
# add an optional HTTP header when downloading content with urllib
header = {}
def __init__(self):
self.pts_path = pjoin(os.environ['HOME'], '.phoronix-test-suite')
self.res_path = pjoin(self.pts_path, 'test-results-all-obm/')
self.db_path = pjoin(os.environ['HOME'], '.phoronix-test-suite')
self.url_base = 'http://openbenchmarking.org/result/{}&export=xml'
self.url_search = 'http://openbenchmarking.org/s/{}&show_more'
self.url_latest = 'http://openbenchmarking.org/results/latest'
self.url_test = 'http://openbenchmarking.org/test/pts/{}'
self.url_test_search = 'http://openbenchmarking.org/test/pts/{}&search'
self.url_test_base = 'http://openbenchmarking.org/tests/pts'
self.hard_soft_tags = set(['Hardware', 'Software'])
self.testid = 'unknown'
self.user_cols = ['User', 'SystemDescription', 'testid', 'Notes',
'SystemIdentifier', 'GeneratedTitle', 'LastModified']
self.testid_cache = None
self.parser = etree.XMLParser(remove_comments=True)
def make_cache_set(self):
"""Create set of all testids present in the res_path directory.
"""
fpath = os.path.join(self.res_path, '**', '*.xml')
self.testid_cache = set(
[os.path.basename(k)[:-4] for k in glob(fpath, recursive=True)])
def load_testid_from_obm(self, testid, use_cache=True, save_xml=False):
"""Download a given testid from OpenBenchmarking.org.
Parameters
----------
testid : str
OpenBenchmarking.org testid, for example: 1606281-HA-RX480LINU80
"""
self.testid = testid
if not use_cache:
self.testid_cache = set([])
elif self.testid_cache is None:
self.make_cache_set()
if testid not in self.testid_cache:
self.load_testid(self.url_base.format(testid))
in_cache = False
else:
fname = self.get_testid_fname()
self.load_testid(fname)
in_cache = True
if save_xml and not in_cache:
self.write_testid_xml()
def load_testid(self, io):
# at some point the XML files suddenly had the following header
# <?xml version="1.0"?>, which it didn't like
# does ignoring comments help?
# https://stackoverflow.com/q/18313818/3156685
if io[:7] == 'http://':
# tree = fromstring(self.get_url(io))
# tree = etree.parse(StringIO(self.get_url(io)))
tree = objectify.parse(StringIO(self.get_url(io)),
parser=self.parser)
else:
# tree = etree.parse(io)
tree = objectify.parse(io, parser=self.parser)
self.root = tree.getroot()
self.io = io
def get_url(self, url):
url = urllib.parse.quote(url, safe='/:')
req = urllib.request.Request(url=url, headers=self.header)
response = urllib.request.urlopen(req)
data = response.read() # a bytes object
text = data.decode('utf-8') # str; can't be used if data is binary
# lxml doesn't like this header
# text = text.replace('<?xml version="1.0"?>\n', '')
# text = text.replace('<!--Phoronix Test Suite v8.2.0-->\n', '')
return text
def get_profiles(self, url):
"""Return a list of test profile id's from a given OBM url. URL is
parsed with safe.
Parameters
----------
url : str
Returns
-------
ids : list
List of test profile id's
"""
tree = fromstring(self.get_url(url))
# all profile names are in h4 elements, and nothing else is, which is
# nice and easy; but if a title is given, the id is in the parent link,
# whose url starts with /result/
ids = [k.getparent().attrib['href'][8:] for k in tree.cssselect('h4')]
return ids
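# A small example of get_profiles with a hypothetical search term (illustrative
# output); the search page lists one h4 element per result, and the parent link
# of each h4 holds the testid:
#
#   obm = OpenBenchMarking()
#   obm.get_profiles(obm.url_search.format('RX 480'))  # -> ['1606281-HA-RX480LINU80', ...]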
def get_testid_fname(self, testid=None):
"""
"""
if testid is None:
testid = self.testid
yy = testid[:2]
mm = testid[2:4]
return pjoin(self.res_path, yy, mm, testid + '.xml')
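# For example, following the yy/mm layout above:
#   self.get_testid_fname(testid='1606281-HA-RX480LINU80')
#   -> ~/.phoronix-test-suite/test-results-all-obm/16/06/1606281-HA-RX480LINU80.xml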
def get_tests(self):
"""Return a list with all PTS tests.
"""
tree = fromstring(self.get_url(self.url_test_base))
tests = []
for h4 in tree.cssselect('h4'):
if len(h4) < 1:
continue
# link starts with /test/pts/
tests.append(h4.getchildren()[0].attrib['href'][10:])
return tests
def write_testid_xml(self):
"""Write testid xml file to res_path/yy/mm/testid.xml
"""
fname = self.get_testid_fname()
fpath = os.path.dirname(fname)
if not os.path.isdir(fpath):
os.makedirs(fpath)
with open(fname, 'w') as f:
f.write(etree.tostring(self.root).decode())
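# A minimal usage sketch of OpenBenchMarking, using the example testid from the
# docstring above; downloads the result (or reuses a cached copy) and saves the
# xml under res_path/yy/mm/:
#
#   obm = OpenBenchMarking()
#   obm.load_testid_from_obm('1606281-HA-RX480LINU80', use_cache=True, save_xml=True)
#   print(obm.get_testid_fname())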
class EditXML(OpenBenchMarking):
def __init__(self):
super().__init__()
def merge(self, list_test_results):
"""DOESN'T MERGE ANYTHING YET
"""
self.root = etree.Element('PhoronixTestSuite')
for test_result in list_test_results:
fname = self.get_testid_fname(testid=test_result)
tree = etree.parse(fname)
root = tree.getroot()
def write_local(self, test_result=None):
if test_result is None:
test_result = self.test_result
fname = self.get_testid_fname(testid=test_result)
fpath = os.path.dirname(fname)
if not os.path.isdir(fpath):
os.makedirs(fpath)
with open(fname, 'w') as f:
f.write(etree.tostring(self.root).decode())
def remove(self, search_tests, search_hardware, search_descr):
sys_rem = []
id_rename = {}
# root.Generated, root.System, root.Result
for el in self.root.findall('System'):
print('--------', el.tag)
# only keep RX 480 systems
hardware = el.find('Hardware')
hardware_els = hardware.text.split(', ')
hardware_dict = {k.split(':')[0]:k.split(':')[1] for k in hardware_els}
# for k in hardware_els:
# print("'" + k.split(':')[0] + "', ", end='')
# software = el.find('Software')
# software_els = software.text.split(', ')
# software_dict = {k.split(':')[0]:k.split(':')[1] for k in software_els}
# for k in software_els:
# print("'" + k.split(':')[0] + "', ", end='')
# cpu = hardware_dict['Processor'].strip()
# try:
# kernel = ' - ' + hardware_dict['Kernel'].strip()
# except:
# kernel = ''
# id_rename[el.find('Identifier').text] = cpu + kernel
cpu = hardware_dict['Processor'].strip() + ' - '
identifier = el.find('Identifier').text
id_rename[identifier] = cpu.split(' @ ')[0] + ' // ' + identifier
if hardware.text.find(search_hardware) > -1:
print(hardware_dict['Processor'])
else:
sys_rem.append(el.find('Identifier').text)
el.getparent().remove(el)
sys_rem = set(sys_rem)
for el in self.root.findall('Result'):
print('--------', el.tag)
# is this the right test at the right description?
try:
find_id = el.find('Identifier').text.find(search_tests)
find_descr = el.find('Description').text.find(search_descr)
except:
find_id = -1
find_descr = -1
if find_id > -1 and find_descr > -1:
print(el.find('Identifier').text)
# only keep systems that have not been filtered out
for entry in el.find('Data').getchildren():
sys_id = entry.find('Identifier').text
if sys_id in sys_rem:
entry.getparent().remove(entry)
else:
el.getparent().remove(el)
return id_rename
def cleanup(self, id_rename):
# keep track of all systems that have test results, remove the ones
# that have been filtered out
sys_list = []
for el in self.root.findall('Result'):
for entry in el.find('Data').getchildren():
identifier = entry.find('Identifier')
sys_list.append(identifier.text)
# rename identifiers
# entry.set('Identifier', id_rename[identifier.text])
identifier.text = id_rename[identifier.text]
sys_list = set(sys_list)
for el in self.root.findall('System'):
identifier = el.find('Identifier')
if identifier.text not in sys_list:
el.getparent().remove(el)
else:
# rename identifier
# el.set('Identifier', id_rename[identifier.text])
identifier.text = id_rename[identifier.text]
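# A rough usage sketch of EditXML with hypothetical filter strings and a
# hypothetical output name; remove() keeps only systems/results matching the
# given test, hardware and description filters, cleanup() drops orphaned
# systems and renames the identifiers, write_local() saves the reduced file
# under res_path:
#
#   ed = EditXML()
#   ed.load_testid_from_obm('1606281-HA-RX480LINU80')
#   id_rename = ed.remove('xonotic', 'RX 480', '1920 x 1080')
#   ed.cleanup(id_rename)
#   ed.write_local('1606281-HA-RX480LINU80-edit')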
class xml2df(OpenBenchMarking):
"""Class for converting a PTS/OpenBenchmarking.org XML result file into a
table formatted as a Pandas.DataFrame object.
"""
def __init__(self, io=None, testid=None):
"""
Parameters
----------
io : path, default=None
File path to the testid file to be loaded, or anything else that
lxml.etree.parse(io) can handle.
testid : str, default=None
testid of the result file to be loaded. If not available locally
it will be downloaded from OpenBenchmarking.org.
"""
super().__init__()
if io is not None:
self.load_testid(io)
elif testid is not None:
self.load_testid_from_obm(testid, use_cache=True)
def _rename_dict_key(self, df_dict, rename):
"""rename a key in a dictionary"""
for key, value in rename.items():
df_dict[value] = df_dict[key]
df_dict.pop(key)
return df_dict
def xml2dict(self):
"""Convert the loaded XML file into a DataFrame ready dictionary of
lists.
"""
dict_sys = self.generated_system2dict()
rename = {'Identifier':'SystemIdentifier',
'Title':'GeneratedTitle'}
#'JSON':'SystemJSON', 'Description':'GeneratedDescription',
dict_sys = self._rename_dict_key(dict_sys, rename)
dict_res = self.data_entry2dict()
rename = {'JSON':'DataEntryJSON',
'Description':'ResultDescription',
'Identifier':'ResultIdentifier',
'Title':'ResultTitle'}
dict_res = self._rename_dict_key(dict_res, rename)
# add columns for the system data
for col in dict_sys:
if col in dict_res:
raise KeyError('{} already in df_dict'.format(col))
dict_res[col] = []
# for each data result entry, add the system col values
for sys_index in dict_res['SystemIndex']:
for col, val in dict_sys.items():
dict_res[col].append(dict_sys[col][sys_index])
dict_res.pop('SystemIndex')
# doing this as DataFrames is very expensive considering the small
# amount of data per XML file, and the large number of XML files.
# df = pd.merge(df_sys, df_res, left_index=True, right_on='SystemIndex')
# # after merging both, SystemIndex is now obsolete
# df.drop('SystemIndex', inplace=True, axis=1)
return dict_res
def xml2dict_split(self):
"""Convert the loaded XML file into a DataFrame ready dictionary of
lists, but keep system and results separate and create a unique
overlapping index.
"""
dict_sys = self.generated_system2dict() #maxlen=200
rename = {'Identifier':'SystemIdentifier',
'Title':'GeneratedTitle'}
#'JSON':'SystemJSON', 'Description':'GeneratedDescription',
dict_sys = self._rename_dict_key(dict_sys, rename)
maxlens = {'Memory' : 100,
'Disk' : 100,
'Graphics' : 96,
'GeneratedTitle' : 115,
'Motherboard' : 75,
'Network' : 93,
'Screen Resolution' : 12}
dict_sys = self._trim_cell_len(dict_sys, maxlens)
dict_res = self.data_entry2dict() #maxlen=60
dict_res = split_json(dict_res)
rename = {'Description':'ResultDescription',
'Identifier':'ResultIdentifier',
'Title':'ResultTitle',
'compiler':'DataEntryCompiler',
'compiler-type':'DataEntryCompilerType',
'max-result':'DataEntryMaxResult',
'min-result':'DataEntryMinResult'}
dict_res = self._rename_dict_key(dict_res, rename)
maxlens = {'AppVersion' : 50,
'Scale' : 50,
'DataEntryIdentifierExtra' : 41,
'ResultDescription' : 149}
dict_res = self._trim_cell_len(dict_res, maxlens)
# prepare full length but empty SystemHash column in dict_sys
nr_systems = len(dict_sys['SystemIdentifier'])
dict_sys['SystemHash'] = ['']*nr_systems
# Add a system hash column for each row
text = ''
for irow in range(nr_systems):
text = ''
for col in dict_sys.keys():
text += str(dict_sys[col][irow])
md5hash = hashlib.md5(text.encode('utf-8')).hexdigest()
dict_sys['SystemHash'][irow] = md5hash
# add SystemHash column to results
dict_res['SystemHash'] = []
# for each data result entry, add the system col values
for sys_index in dict_res['SystemIndex']:
md5hash = dict_sys['SystemHash'][sys_index]
dict_res['SystemHash'].append(md5hash)
dict_res.pop('SystemIndex')
return dict_res, dict_sys
def dict2df(self, dict_res):
"""Convert a df_dict to a DataFrame and convert columns to proper
c-type variable names and values.
RawString is a series of values separated by :
Value can be a series of values separated by ,
"""
# split the Value column into a float and array part
if 'Value' in dict_res:
dict_res['ValueArray'] = []
for i, valstring in enumerate(dict_res['Value']):
if valstring is None:
valstring = ''
valarray = np.fromstring(valstring, sep=',')
# if we have more than one element it is a series, otherwise
# just a single value
if len(valarray) > 1:
dict_res['Value'][i] = np.nan
dict_res['ValueArray'].append(valarray)
elif len(valarray)==0:
dict_res['Value'][i] = np.nan
dict_res['ValueArray'].append(np.array([np.nan]))
else:
dict_res['Value'][i] = valarray[0]
dict_res['ValueArray'].append(np.array([np.nan]))
# RawString will always (?) hold more than one value
if 'RawString' in dict_res:
for i, valstring in enumerate(dict_res['RawString']):
# FIXME: an empty field in the xml seems to be read back as None?
if valstring is None:
valarray = np.array([np.nan])
else:
valarray = np.fromstring(valstring, sep=':')
dict_res['RawString'][i] = valarray
# convert to dataframe, set datatypes
df = pd.DataFrame(dict_res)
# convert all column names to c-name compatible
df.rename(columns=lambda x: x.replace('-', '').replace(' ', ''),
inplace=True)
return df
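# For reference, the Value handling above maps roughly as follows (derived
# from the branches in dict2df):
#   '12.3'   -> array([12.3])        -> Value=12.3, ValueArray=array([nan])
#   '1,2,3'  -> array([1., 2., 3.])  -> Value=nan,  ValueArray=array([1., 2., 3.])
#   ''/None  -> empty array          -> Value=nan,  ValueArray=array([nan])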
def _split2dict(self, string):
"""Convert following string to dictionary:
key1: value1, key2: value2, ...
"""
# some old result files have (Total Cores: #) instead of (# Cores)
elements = string.replace('Total Cores:', 'Total Cores').split(', ')
return {k.split(':')[0]:k.split(':')[1] for k in elements}
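# For example (note the leading space in the values, since only ', ' is split):
#   self._split2dict('Processor: Intel Core i7, Memory: 16384MB')
#   -> {'Processor': ' Intel Core i7', 'Memory': ' 16384MB'}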
def _add2row(self, elements, columns, df_dict, missing_val='',
rename={}, ignore=set([])):
"""
Elements with the tag Hardware and Software are split into multiple
columns.
Parameters
----------
elements : list of lmxl elements
columns : set
columns names present in df_dict
df_dict : dict
pandas.DataFrame dictionary
missing_val : str, default=''
When a tag occurs in columns but not in elements, it is added to
df_dict with missing_val as value. Rename is applied after the
missing keys from columns are checked.
rename : dict, default={}
Mapping of old to new column names, applied after the row is added.
ignore : set, default=set([])
Ignore elements.
Returns
-------
df_dict : dict
pandas.DataFrame dictionary with one added row for all the columns
of the set columns. Elements should be a subset of columns.
Values occurring in columns but not in elements are added with the
value as set in the missing_val variable.
"""
# make sure that all contained elements are used, and that
# missing ones are filled in as empty to preserve a valid
# DataFrame dictionary
found_els = []
for el in elements:
if el.tag in ignore:
continue
# TODO: should be like split_info methods for cpu, gpu, memory
if el.tag in self.hard_soft_tags:
# Here the columns HardwareHash and SoftwareHash are created.
# split the Hardware and Software tags into the columns
tmp = self._split2dict(el.text)
# add hash to have a unique identifier for each configuration
md5hash = hashlib.md5(el.text.encode('utf-8')).hexdigest()
tmp[el.tag + 'Hash'] = md5hash
for key, value in tmp.items():
df_dict[key].append(value)
found_els.append(key)
else:
df_dict[el.tag].append(el.text)
found_els.append(el.tag)
# populate missing keys with an empty value
for key in columns - set(found_els):
df_dict[key].append(missing_val)
# rename a certain column
for key, value in rename.items():
df_dict[value] = df_dict[key]
df_dict.pop(key)
return df_dict
def _trim_cell_len(self, df_dict, maxlens):
"""Trim maximum length of a certain columns/cells
"""
col0 = list(df_dict.keys())[0]
for i in range(len(df_dict[col0])):
for col, maxlen in maxlens.items():
if isinstance(df_dict[col][i], str):
df_dict[col][i] = df_dict[col][i][:maxlen]
return df_dict
def generated_system2dict(self, missing_val=''):
"""For a given xml result file from pts/OpenBenchmarking.org, convert
the Generated and System tags to a Pandas DataFrame. This means that
the data contained in the Generated tag will now be repeated for each
of the systems contained in the System tag.
This duplicates data across rows, which helps when searching/selecting.
The Hardware and Software tags are split into multiple columns to
facilitate a more fine-grained searching and selection process.
The following columns are ignored from the Generated group to avoid
overly long column values: Description
The following columns are ignored from the System group to avoid overly
long column values: JSON
"""
generated = ['Title', 'LastModified', 'TestClient', 'Description',
'Notes', 'InternalTags', 'ReferenceID',
'PreSetEnvironmentVariables']
gen_ignore = set(['Description'])
system = ['Identifier', 'Hardware', 'Software', 'User', 'TimeStamp',
'TestClientVersion', 'Notes', 'JSON', 'System Layer']
sys_ignore = set(['JSON']) # 'Notes'
hardware = ['Processor', 'Motherboard', 'Chipset', 'Memory', 'Disk',
'Graphics', 'Audio', 'Network', 'Monitor', 'HardwareHash']
software = ['OS', 'Kernel', 'Desktop', 'Display Server',
'Display Driver', 'OpenGL', 'OpenCL', 'Vulkan', 'Compiler',
'File-System', 'Screen Resolution', 'SoftwareHash']
cols_sys = system + hardware + software
# Remove columns we do not want to use because they are too long
cols_sys.remove('Hardware')
cols_sys.remove('Software')
for k in sys_ignore:
cols_sys.remove(k)
generated_set = set(generated) - set(gen_ignore)
system_set = set(cols_sys)
# hardware_set = set(hardware)
# software_set = set(software)
dict_sys = {k:[] for k in cols_sys}
# els_generated = self.root.findall('Generated')
# dict_sys = self._add2row(els_generated, generated_set, dict_sys)
#
# for key, value in dict_sys.items():
# print(key.rjust(28), len(value))
#
# els_generated = self.root.findall('System')
# dict_sys = self._add2row(els_generated, system_set, dict_sys)
#
# for key, value in dict_sys.items():
# print(key.rjust(28), len(value))
# there should only be one Generated element
gen_elements = self.root.findall('Generated')
# FIXME: data checks should take place in an xml check method/class!
assert len(gen_elements) == 1
# create dictionary of the tags/values in Generated
dict_gen = {el.tag : el.text for el in gen_elements[0]
if el.tag not in gen_ignore}
# Are there any surprises in the tags? Tags we haven't seen before?
assert len(dict_gen.keys() - (generated_set | gen_ignore)) == 0
# add empty values for possible missing keys
for key in generated_set - set(dict_gen.keys()):
dict_gen[key] = missing_val
# also include the URL testid identifier which is unique for each
# test entry on OpenBenchmarking.org
dict_gen['testid'] = self.testid
# For each system create a row in the df_dict
systems = self.root.findall('System')
for sys_els in systems:
dict_sys = self._add2row(sys_els, system_set, dict_sys,
ignore=sys_ignore)
# sanity checks
for key, value in dict_sys.items():
if not len(systems) == len(value):
rpl = [key, len(value), len(systems)]
msg = '{} has {} elements instead of {}'.format(*rpl)
raise AssertionError(msg)
# expand with the same values for Generated columns, this duplication
# of data will make searching/selecting in the DataFrame later easier
# FIXME: data duplication of the Generated tags for all test runs
for key, value in dict_gen.items():
dict_sys[key] = [value]*len(systems)
# for key, value in dict_sys.items():
# print(key.rjust(28), len(value))
# the indices for Identifier in Results/Data/Entry
self.ids = {key:i for i,key in enumerate(dict_sys['Identifier'])}
return dict_sys#, dict_gen
def data_entry2dict(self, missing_val=''):
"""For a given xml result file from pts/OpenBenchmarking.org, convert
the Result tags to a Pandas DataFrame. This means that the data
contained in the Result tag will now be replicated for each of the
Data/Entry tags for that given test result.
"""
# ResultOf indicates whether the result belongs to another result
# for example, corresponding CPU usage, render time per frame.
# ResultOf is not defined in the XML source.
result = ['Identifier', 'Title', 'AppVersion', 'Arguments', 'ResultOf',
'Description', 'Scale', 'Proportion', 'DisplayFormat']
res_ignore = set(['Arguments']) # , 'Description'
data_entry = ['Identifier', 'Value', 'RawString', 'JSON']
dat_ignore = set([])#'JSON'])
data_entry_ = set(['DataEntryIdentifier'] + data_entry[1:]) - dat_ignore
data_set = set(data_entry) - dat_ignore
rename = {'Identifier':'DataEntryIdentifier'}
dict_res = {k:[] for k in set(result) - res_ignore | data_entry_}
res_elements = list(self.root.findall('Result'))
res_title = ''
for res_el in res_elements:
# get all the details of the test in question
res_id = {k.tag:k.text for k in res_el if k.tag not in res_ignore}
res_id.pop('Data')
data_entries = res_el.find('Data')
# if the result identifier is empty, it corresponds to the previous
# result (usually CPU/frame time for LINE_GRAPH). Add one
# additional check: result title should be the same
if res_id['Identifier'] != missing_val:
res_id_val = res_id['Identifier']
res_title = res_id['Title']
res_id['ResultOf'] = 'no'
elif res_id['Title'] == res_title:
res_id['Identifier'] = res_id_val
res_id['Title'] = res_title
res_id['ResultOf'] = 'yes'
# some cases just have no result identifier and do not belong to
# another test
else:
res_id['ResultOf'] = 'na'
# add each result to the collection
tmp = {k:[] for k in data_set}
for entry in data_entries:
tmp = self._add2row(entry, data_set, tmp, ignore=dat_ignore)
# before merging, rename Identifier
tmp = self._rename_dict_key(tmp, rename)
# add with the rest
for key, value in tmp.items():
dict_res[key].extend(value)
# and add the Result element columns to all the data entries
# FIXME: Data duplication of the Result tags for each run
for key, value in res_id.items():
dict_res[key].extend([value]*len(data_entries))
# add the index of the System as a column because SystemIdentifier is
# not unique! Will be used when merging/joining with Generated/system
dict_res['SystemIndex'] = []
# when having multiple bars per group, the data/entry identifier
# can be shorter compared to the system identifier, and also contains
# an additional label: "EXTRA LABEL: SHORT SYSTEM ID"
dict_res['DataEntryIdentifierExtra'] = []
dict_res['DataEntryIdentifierShort'] = []
for identifier in dict_res['DataEntryIdentifier']:
idf_split = identifier.split(': ')
if len(idf_split) == 2:
idf_short = idf_split[1]
dict_res['DataEntryIdentifierExtra'].append(idf_split[0])
dict_res['DataEntryIdentifierShort'].append(idf_split[1])
# find the long version of the identifier
for idf in self.ids:
if idf.find(idf_short) > -1:
dict_res['SystemIndex'].append(self.ids[idf])
break
else:
dict_res['SystemIndex'].append(self.ids[identifier])
dict_res['DataEntryIdentifierExtra'].append(missing_val)
dict_res['DataEntryIdentifierShort'].append(missing_val)
# for key, value in dict_res.items():
# print(key.rjust(28), len(value))
return dict_res
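# A minimal usage sketch of xml2df, using the example testid from above:
# convert one result file into flat result/system dictionaries and then into
# DataFrames:
#
#   x = xml2df(testid='1606281-HA-RX480LINU80')
#   dict_res, dict_sys = x.xml2dict_split()
#   df_res = x.dict2df(dict_res)
#   df_sys = x.dict2df(dict_sys)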
class DataBase(xml2df):
def __init__(self):
super().__init__()
self.regex = re.compile(r'^[0-9]{7}\-[A-Za-z0-9]*\-[A-Za-z0-9]*$')
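# for example, the testid regex accepts 7 digits, a dash, an alphanumeric
# block, another dash and a final alphanumeric block:
#   self.regex.findall('1606281-HA-RX480LINU80')  # -> ['1606281-HA-RX480LINU80']
#   self.regex.findall('latest')                  # -> []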
def load_db(self):
fname = pjoin(self.db_path, 'database_results.h5')
df = pd.read_hdf(fname, 'table')
fname = pjoin(self.db_path, 'database_systems.h5')
df_sys = pd.read_hdf(fname, 'table')
return df, df_sys
def get_hdf_stores(self):
fname = pjoin(self.db_path, 'database_results.h5')
self.store = pd.HDFStore(fname, mode='a', format='table',
complib='blosc', compression=9)
fname = pjoin(self.db_path, 'database_systems.h5')
self.store_sys = pd.HDFStore(fname, mode='a', format='table',
complib='blosc', compression=9)
def build(self, debug=False):
"""Build complete database from scratch, over write existing.
"""
df_dict, df_dict_sys = self.testids2dict()
if debug:
DataFrameDict.check_column_length(df_dict)
DataFrameDict.check_column_length(df_dict_sys)
maxcellen1, celltype1 = DataFrameDict.check_cell_len(df_dict)
maxcellen2, celltype2 = DataFrameDict.check_cell_len(df_dict_sys)
return
print('Convert results from df_dict to df')
df = self.dict2df(df_dict)
print('df cleanup')
df = self.cleanup(df)
print('save results to disk')
fname = pjoin(self.db_path, 'database_results.h5')
# df.drop(['ValueArray', 'RawString'], axis=1, inplace=True)
df.to_hdf(fname, 'table', format='table', complib='blosc', mode='w',
compression=9, data_columns=True)
del df_dict
del df
gc.collect()
print('Convert systems from df_dict to df')
df = self.dict2df(df_dict_sys)
print('df cleanup')
df = self.cleanup(df)
print('save results to disk')
fname = pjoin(self.db_path, 'database_systems.h5')
# df.drop(['ValueArray', 'RawString'], axis=1, inplace=True)
df.to_hdf(fname, 'table', format='table', complib='blosc', mode='w',
compression=9, data_columns=True)
del df_dict_sys
del df
gc.collect()
def update(self, testids=None):
"""Load existing database and add testids that haven't been added.
"""
df, df_sys = self.load_db()
if testids is None:
# already included testid's
testids_df = set(df_sys['testid'].unique())
# all downloaded testids
self.make_cache_set()
# only add testids that are on disk but not in the database
testids = self.testid_cache - testids_df
print('\nupdating with %i new testids' % len(testids))
df_dict, df_dict_sys = self.testids2dict(testids=testids)
self.get_hdf_stores()
df = self.dict2df(df_dict)
df = self.cleanup(df, i0=len(df))
# https://stackoverflow.com/a/15499291
# use data_columns=True for robustness: process column by column and
# raise when a data type is being offended
self.store.append('table', df, data_columns=True)
self.store.close()
df = self.dict2df(df_dict_sys)
df = self.cleanup(df, i0=len(df))
self.store_sys.append('table', df, data_columns=True)
self.store_sys.close()
gc.collect()
def testids2dict(self, testids=None):
"""Load all local test id xml files and convert to pandas.DataFrame
dictionaries.
Parameters
----------
testids : iterable, default=None
Iterable holding the testid's to be converted to dicts.
If None, all testid's stored at xml.res_path are considered.
Returns
-------
df_dict, df_dict_sys : pandas.DataFrame dictionary
"""
df_dict = None
df_dict_sys = None
# consider all testids if None
if testids is None:
# make a list of all available test id folders
self.make_cache_set()
testids = self.testid_cache
regex = re.compile(r'^[0-9]{7}\-[A-Za-z0-9]*\-[A-Za-z0-9]*$')
i = 0
for testid in tqdm(testids):
# if i > 10000:
# break
regex.findall(testid)
if len(regex.findall(testid)) != 1:
continue
self.testid = testid
self.load_testid(self.get_testid_fname(testid=testid))
i += 1
try:
# _df_dict = xml2dict()
_df_dict, _df_dict_sys = self.xml2dict_split()
except Exception as e:
print('')
print('conversion to df_dict of {} failed.'.format(testid))
print(e)
continue
# make sure we have a consistent df
k1 = set([len(val) for key, val in _df_dict.items()])
k2 = set([len(val) for key, val in _df_dict_sys.items()])
if len(k1) > 1 or len(k2) > 1:
DataFrameDict.check_column_length(_df_dict)
DataFrameDict.check_column_length(_df_dict_sys)
print('conversion to df_dict of {} failed.'.format(testid))
continue
if df_dict is None:
df_dict = {key:[] for key in _df_dict}
if df_dict_sys is None:
df_dict_sys = {key:[] for key in _df_dict_sys}
for key, val in _df_dict.items():
df_dict[key].extend(val)
for key, val in _df_dict_sys.items():
df_dict_sys[key].extend(val)
return df_dict, df_dict_sys
def cleanup(self, df, i0=0):
# FIXME: is it safe to ignore array columns when looking for duplicates?
# there are probably going to be more duplicates
# doesn't work for ndarray columns
arraycols = set(['RawString', 'ValueArray', 'testruntimes'])
cols = list(set(df.columns) - arraycols)
df.drop_duplicates(inplace=True, subset=cols)
# columns that can hold different values but could still refer to the same
# test data. So basically all user-defined columns should be ignored.
# But do not drop the columns, just ignore them for de-duplication.
cols = list(set(df.columns) - set(self.user_cols) - arraycols)
# mark True for values that are NOT duplicates
df = df.loc[np.invert(df.duplicated(subset=cols).values)]
# split the Processor column into Processor info, frequency and cores
if 'Processor' in df.columns:
df = split_cpu_info(df)
# trim all columns
for col in df:
if df[col].dtype==np.object:
df[col] = df[col].str.strip()
# convert object columns to string, but leave other data types as is
# ignore columns with very long strings to avoid out of memory errors
# RawString and ValueArray can contain a time series
ignore = set(['Value']) | arraycols
for col, dtype in df.dtypes.items():
if dtype==np.object and col not in ignore:
print('cleanup col:', col)
try:
# df[col] = df[col].astype('category')
df[col] = df[col].values.astype(np.str)
except Exception as e:
print(col)
raise e
# # leave array data in different dataframe
# df_arr = df[['testid', 'ValueArray', 'Rawstring']]
# trim all columns
for col in df:
if df[col].dtype==np.object:
df[col] = df[col].str.strip()
# create a new unique index
df.index = i0 + np.arange(len(df))
# remove all spaces in column names
# new_cols = {k:k.replace(' ', '').replace('-', '') for k in df.columns}
# df.rename(columns=new_cols, inplace=True)
return df
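# A minimal usage sketch of DataBase, assuming the result xml files have
# already been downloaded into res_path; build() writes database_results.h5
# and database_systems.h5 from scratch, update() appends testids found on
# disk but not yet in the database, load_db() returns both tables:
#
#   db = DataBase()
#   db.build()            # or incrementally: db.update()
#   df, df_sys = db.load_db()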
def search_openbm(search=None, save_xml=True, use_cache=True, tests=[],
latest=False):
"""
Parameters
----------