extract_profiling_benchmarker.py
#!/usr/bin/env python
# Author: Sawood Alam <[email protected]>
#
# This script generates multiple profiles from the same collection with varying configurations and benchmarks them.
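#
# Usage sketch (the collection name and file paths below are illustrative, not from the original):
#
#   COLLECTION="UK Web Archive" python extract_profiling_benchmarker.py extracts/*.curi
#
# The COLLECTION environment variable names the collection being profiled, and each ".curi"
# extract is expected to have a corresponding ".cdx" file next to it so that the original
# CDX size can be recorded alongside the extract size.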
import os
import sys
import gzip
import json
import time
import re
from profile import Profile
from cdx_extract_profiler import CDXExtractProfiler


def write_json(jsonstr="{}", filepath="profile.json", compress=False):
    """Save JSON on the local filesystem and optionally write a gzipped copy alongside it."""
    print("Writing output to " + filepath)
    with open(filepath, "w") as f:
        f.write(jsonstr)
    if compress:
        # Stream the file into a ".gz" sibling in 1 MB blocks.
        with open(filepath, "rb") as f_in, gzip.open(filepath + ".gz", "wb") as f_out:
            while True:
                block = f_in.read(1048576)
                if not block:  # read() returns b"" at EOF in binary mode
                    break
                f_out.write(block)
        # zf = gzip.open(filepath + ".gz", "wb")
        # zf.write(jsonstr)
        # zf.close()


def build_profile(host, path):
    """Build one profile with the given host/path segment limits, write it out, and record its benchmark.

    Relies on module-level names set in the __main__ block: collection, col_id,
    bmdir, cdx_size, extract_size, and all_bms.
    """
    print("Profiling {0} with Host: {1}, Path: {2}".format(collection, host, path))
    bm_id = "H{0}P{1}".format(host, path)
    profile_id = "{0}-{1}".format(col_id, bm_id)
    profiling_start = time.time()
    p = Profile(name="{0} Hosts {1} Paths {2}".format(collection, host, path),
                description="{0} collection profile with maximum {1} host and {2} path segment(s).".format(collection, host, path),
                homepage="http://www.webarchive.org.uk/ukwa/",
                accesspoint="http://www.webarchive.org.uk/wayback/",
                memento_compliance="https://oduwsdl.github.io/terms/mementosupport#native",
                timegate="http://www.webarchive.org.uk/wayback/archive/",
                timemap="http://www.webarchive.org.uk/wayback/archive/timemap/link/",
                established="2004",
                profile_updated=time.strftime("%Y-%m-%dT%H:%M:%SZ"),
                mechanism="https://oduwsdl.github.io/terms/mechanism#cdx")
    cp = CDXExtractProfiler(max_host_segments=host,
                            max_path_segments=path,
                            global_stats=True)
    cp.process_cdx_extracts(sys.argv[1:])
    cdx_processing_done = time.time()
    cp.calculate_stats()
    stats_calculation_done = time.time()
    p.stats = cp.stats
    jsonstr = p.to_json()
    opf = "profile-{0}.json".format(profile_id)
    opfpath = os.path.join(bmdir, opf)
    write_json(jsonstr, filepath=opfpath, compress=True)
    profiling_done = time.time()
    bm = {
        "profile": opf,
        "collection": col_id,
        "max_host": host,
        "max_path": path,
        "cdx_size": cdx_size,
        "extract_size": extract_size,
        "profile_size": os.path.getsize(opfpath),
        "profile_size_compressed": os.path.getsize(opfpath + ".gz"),
        "urir_count": p.stats["urir"],
        "urim_count": p.stats["urim"]["total"],
        "suburi_keys": len(p.stats["suburi"]),
        "time_keys": len(p.stats["time"]),
        "mediatype_keys": len(p.stats["mediatype"]),
        "language_keys": len(p.stats["language"]),
        "cdx_processing_time": cdx_processing_done - profiling_start,
        "stats_calculation_time": stats_calculation_done - cdx_processing_done,
        "profiling_time": profiling_done - profiling_start
    }
    all_bms["bms"][bm_id] = bm
    jsonstr = json.dumps(bm, sort_keys=True, indent=4, separators=(",", ": "))
    opf = "bm-{0}.json".format(profile_id)
    opfpath = os.path.join(bmdir, opf)
    write_json(jsonstr, filepath=opfpath)
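
# Each call to build_profile() adds a record like the following to all_bms["bms"] and also
# writes it as bm-<profile_id>.json; only the keys come from the code above, the key name
# "H3P1" and all values shown here are illustrative placeholders:
#
#   "H3P1": {
#       "profile": "profile-my-collection-H3P1.json",
#       "collection": "my-collection",
#       "max_host": 3,
#       "max_path": 1,
#       "cdx_size": 123456789,
#       "extract_size": 2345678,
#       "profile_size": 34567,
#       "profile_size_compressed": 4567,
#       "urir_count": 100000,
#       "urim_count": 150000,
#       "suburi_keys": 42000,
#       "time_keys": 120,
#       "mediatype_keys": 35,
#       "language_keys": 12,
#       "cdx_processing_time": 98.7,
#       "stats_calculation_time": 1.2,
#       "profiling_time": 102.3
#   }
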

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Please provide path(s) to CDX Extract file(s) as command line argument(s).")
        sys.exit(1)
    print("\n{0} => Running: {1}\n".format(time.strftime("%Y-%m-%d %H:%M:%S"), sys.argv))
    benchmarking_start = time.time()
    collection = os.getenv("COLLECTION", "Test CDX Extract")
    col_id = re.sub(r"\W+", "-", collection.lower())
    scriptdir = os.path.dirname(os.path.abspath(__file__))
    bmdir = os.path.join(scriptdir, "extractbm", col_id)
    if not os.path.exists(bmdir):
        os.makedirs(bmdir)
    # Total sizes of the extract files and of their corresponding ".cdx" files.
    extract_size = sum(os.path.getsize(f) for f in sys.argv[1:])
    cdx_size = sum(os.path.getsize(f.replace(".curi", ".cdx")) for f in sys.argv[1:])
    all_bms = {"about": {"id": col_id, "name": collection}, "bms": {}}
    # Profile the same extracts under each host/path segment limit combination.
    host_path_pairs = [(1, 0), (2, 0), (2, 1), (2, 2), (3, 0), (3, 1), (3, 2), (3, 3), (4, 0), (5, 0), ("x", 0), ("x", 1), ("x", 2), ("x", 3), ("x", 4), ("x", 5), ("x", "x")]
    for host, path in host_path_pairs:
        build_profile(host, path)
    benchmarking_done = time.time()
    all_bms["about"]["benchmarking_time"] = benchmarking_done - benchmarking_start
    jsonstr = json.dumps(all_bms, sort_keys=True, indent=4, separators=(",", ": "))
    opfpath = os.path.join(bmdir, "bm-{0}.json".format(col_id))
    write_json(jsonstr, filepath=opfpath)
    print("All Done! (Time: {0} minutes)\n".format(int(all_bms["about"]["benchmarking_time"] / 60)))