#!/usr/bin/env python3
#
# Copyright 2018, The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Perform statistical analysis on measurements produced by app_startup_runner.py

Install:
$> sudo apt-get install python3-scipy

Usage:
$> ./analyze_metrics.py <filename.csv> [<filename2.csv> ...]
$> ./analyze_metrics.py --help
"""

import argparse
import csv
import itertools
import os
import subprocess
import sys
import tempfile
from typing import Any, List, Dict, Iterable, TextIO, Tuple

from scipy import stats as sc
import numpy as np


# These CSV columns are considered labels. Everything after them in the same row are metrics.
_LABEL_COLUMNS = ['packages', 'readaheads', 'compiler_filters']
# The metric series with the 'cold' readahead is the baseline.
# All others (warm, jit, etc) are the potential improvements.

# FIXME: this should probably be an option
_BASELINE = ('readaheads', 'cold')
# ignore this for some statistic calculations
_IGNORE_PAIR = ('readaheads', 'warm')
_PLOT_SUBKEY = 'readaheads'
_PLOT_GROUPKEY = 'packages'
_PLOT_DATA_INDEX = 0
_DELTA = 50
_DELTA2 = 100
_PVALUE_THRESHOLD = 0.10
_debug = False  # See -d/--debug flag.

def parse_options(argv: List[str] = None):
    """Parse command line arguments and return an argparse Namespace object."""
    parser = argparse.ArgumentParser(description="Perform statistical analysis on measurements produced by app_startup_runner.py.")
    parser.add_argument('input_files', metavar='file.csv', nargs='+', help='CSV file produced by app_startup_runner.py')

    parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='Add extra debugging output')
    parser.add_argument('-os', '--output-samples', dest='output_samples', default='/dev/null', action='store', help='Output CSV for per-sample data')
    parser.add_argument('-oc', '--output-comparable', dest='output_comparable', default='/dev/null', action='store', help='Output CSV for comparison against the baseline')
    parser.add_argument('-ocs', '--output-comparable-significant', dest='output_comparable_significant', default='/dev/null', action='store', help='Output CSV for comparison against the baseline (significant results only)')
    parser.add_argument('-pt', '--pvalue-threshold', dest='pvalue_threshold', type=float, default=_PVALUE_THRESHOLD, action='store')
    parser.add_argument('-dt', '--delta-threshold', dest='delta_threshold', type=int, default=_DELTA, action='store')

    return parser.parse_args(argv)

def _debug_print(*args, **kwargs):
    """Print the args to sys.stderr if the --debug/-d flag was passed in."""
    global _debug
    if _debug:
        print(*args, **kwargs, file=sys.stderr)

def _expand_gen_repr(args):
    new_args_list = []
    for i in args:
        # detect iterable objects that do not have their own override of __str__
        if hasattr(i, '__iter__'):
            to_str = getattr(i, '__str__')
            if to_str.__objclass__ == object:
                # the repr for a generator is just type+address, expand it out instead.
                new_args_list.append([_expand_gen_repr([j])[0] for j in i])
                continue
        # normal case: uses the built-in to-string
        new_args_list.append(i)
    return new_args_list

def _debug_print_gen(*args, **kwargs):
    """Like _debug_print but will turn any iterable args into a list."""
    if not _debug:
        return

    new_args_list = _expand_gen_repr(args)
    _debug_print(*new_args_list, **kwargs)

def read_headers(input_file: TextIO) -> Tuple[List[str], List[str]]:
    _debug_print("read_headers for file: ", input_file.name)
    csv_reader = csv.reader(input_file)

    label_num_columns = len(_LABEL_COLUMNS)

    try:
        header = next(csv_reader)
    except StopIteration:
        header = None
    _debug_print('header', header)

    if not header:
        return (None, None)

    labels = header[0:label_num_columns]
    data = header[label_num_columns:]

    return (labels, data)

def read_labels_and_data(input_file: TextIO) -> Iterable[Tuple[List[str], List[int]]]:
    _debug_print("read_labels_and_data for file: ", input_file.name)
    csv_reader = csv.reader(input_file)

    # Skip the header because it doesn't contain any data.
    # To get the header see the read_headers function.
    try:
        header = next(csv_reader)
    except StopIteration:
        header = None

    label_num_columns = len(_LABEL_COLUMNS)

    for row in csv_reader:
        if len(row) > 0 and row[0].startswith(';'):
            _debug_print("skip comment line", row)
            continue

        labels = row[0:label_num_columns]
        data = [int(i) for i in row[label_num_columns:]]

        # _debug_print("labels:", labels)
        # _debug_print("data:", data)

        yield (labels, data)

def group_metrics_by_label(it: Iterable[Tuple[List[str], List[int]]]):
    prev_labels = None
    data_2d = []

    for label_list, data_list in it:
        if prev_labels != label_list:
            if prev_labels:
                # _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
                yield (prev_labels, data_2d)
            data_2d = []

        data_2d.append(data_list)
        prev_labels = label_list

    if prev_labels:
        # _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
        yield (prev_labels, data_2d)

def data_to_numpy(it: Iterable[Tuple[List[str], List[List[int]]]]) -> Iterable[Tuple[List[str], Any]]:
    for label_list, data_2d in it:
        yield (label_list, np.asarray(data_2d, dtype=int))

def iterate_columns(np_data_2d):
    for col in range(np_data_2d.shape[1]):
        col_as_array = np_data_2d[:, col]
        yield col_as_array

def confidence_interval(np_data_2d, percent=0.95):
    """
    Given some data [[a,b,c],[d,e,f],...]

    We assume each column (e.g. [a,d]) holds repeated samples of the same metric,
    and each row (e.g. [a,b,c]) holds the different metrics for a single sample.

    We then calculate the CI for each metric individually, returning it as a list of tuples.
    """
    arr = []
    for col_2d in iterate_columns(np_data_2d):
        mean = col_2d.mean()
        sigma = col_2d.std()

        ci = sc.norm.interval(percent, loc=mean, scale=sigma / np.sqrt(len(col_2d)))
        arr.append(ci)

    # TODO: This seems to be returning NaN when all the samples have the same exact value
    # (e.g. stddev=0, which can trivially happen when sample count = 1).
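    # A likely explanation (an assumption, not verified against every scipy version):
    # sc.norm.interval requires scale > 0, so a zero standard deviation tends to make
    # scipy return a (nan, nan) interval here.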

    return arr

def print_analysis(it, label_header: List[str], data_header: List[str], output_samples: str):
    print(label_header)

    with open(output_samples, "w") as output_file:

        csv_writer = csv.writer(output_file)
        csv_writer.writerow(label_header + ['mean', 'std', 'confidence_interval_a', 'confidence_interval_b'])

        for label_list, np_data_2d in it:
            print("**********************")
            print(label_list)
            print()
            print(" ", data_header)
            # aggregate computation column-wise
            print("Mean: ", np_data_2d.mean(axis=0))
            print("Std: ", np_data_2d.std(axis=0))
            print("CI95%:", confidence_interval(np_data_2d))
            print("SEM: ", stats_standard_error_one(np_data_2d, axis=0))

            # ci = confidence_interval(np_data_2d)[_PLOT_DATA_INDEX]
            sem = stats_standard_error_one(np_data_2d, axis=0)[_PLOT_DATA_INDEX]
            mean = np_data_2d.mean(axis=0)[_PLOT_DATA_INDEX]

            ci = (mean - sem, mean + sem)

            csv_writer.writerow(label_list + [mean, np_data_2d.std(axis=0)[_PLOT_DATA_INDEX], ci[0], ci[1]])

def from_file_group_by_labels(input_file):
    (label_header, data_header) = read_headers(input_file)
    label_data_iter = read_labels_and_data(input_file)
    grouped_iter = group_metrics_by_label(label_data_iter)
    grouped_numpy_iter = data_to_numpy(grouped_iter)

    return grouped_numpy_iter, label_header, data_header

def list_without_index(lst, index):
    return lst[:index] + lst[index + 1:]

def group_by_without_baseline_key(grouped_numpy_iter, label_header):
    """
    Data is considered comparable if the only difference is the baseline key
    (i.e. the readahead is different but the package, compilation filter, etc, are the same).

    Returns iterator that's grouped by the non-baseline labels to an iterator of
    (label_list, data_2d).
    """
    baseline_index = label_header.index(_BASELINE[0])

    def get_label_without_baseline(tpl):
        label_list, _ = tpl
        return list_without_index(label_list, baseline_index)
    # [['pkgname', 'compfilter', 'warm'], [data]]
    # [['pkgname', 'compfilter', 'cold'], [data2]]
    # [['pkgname2', 'compfilter', 'warm'], [data3]]
    #
    # ->
    # ( [['pkgname', 'compfilter', 'warm'], [data]]      # ignore baseline label change.
    #   [['pkgname', 'compfilter', 'cold'], [data2]] ),  # split here because the pkgname changed.
    # ( [['pkgname2', 'compfilter', 'warm'], [data3]] )
    for group_info, it in itertools.groupby(grouped_numpy_iter, key=get_label_without_baseline):
        yield it

    # TODO: replace this messy manual iteration/grouping with pandas

def iterate_comparable_metrics(without_baseline_iter, label_header):
    baseline_index = label_header.index(_BASELINE[0])
    baseline_value = _BASELINE[1]

    _debug_print("iterate comparables")

    def is_baseline_fun(tp):
        ll, dat = tp
        return ll[baseline_index] == baseline_value

    # iterating here when everything but the baseline key is the same.
    for it in without_baseline_iter:
        it1, it2 = itertools.tee(it)

        # find all the baseline data.
        baseline_filter_it = filter(is_baseline_fun, it1)

        # find non-baseline data.
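        # (itertools.filterfalse keeps the groups rejected by is_baseline_fun, i.e. every
        # non-baseline readahead; itertools.product below then pairs each baseline group
        # with each non-baseline group.)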
        nonbaseline_filter_it = itertools.filterfalse(is_baseline_fun, it2)

        yield itertools.product(baseline_filter_it, nonbaseline_filter_it)

def stats_standard_error_one(a, axis):
    a_std = a.std(axis=axis, ddof=0)
    a_len = a.shape[axis]

    return a_std / np.sqrt(a_len)

def stats_standard_error(a, b, axis):
    a_std = a.std(axis=axis, ddof=0)
    b_std = b.std(axis=axis, ddof=0)

    a_len = a.shape[axis]
    b_len = b.shape[axis]

    temp1 = a_std * a_std / a_len
    temp2 = b_std * b_std / b_len

    return np.sqrt(temp1 + temp2)

def stats_tvalue(a, b, axis, delta=0):
    a_mean = a.mean(axis=axis)
    b_mean = b.mean(axis=axis)

    return (a_mean - b_mean - delta) / stats_standard_error(a, b, axis)

def stats_pvalue(a, b, axis, delta, left: bool = False):
    """
    One-tailed 2-sample t-test.

    Returns the p-value for the one-sided alternative hypothesis mean(a) - mean(b) > delta
    (or mean(a) - mean(b) < delta when left=True).
    :param a: numpy 2d array
    :param b: numpy 2d array
    :param axis: which axis to do the calculations across
    :param delta: test value of mean differences
    :param left: if true, run the left-tailed test (< delta) instead of the right-tailed test (> delta)
    :return: p-value
    """
    # implement our own pvalue calculation because the built-in t-test (t, p values)
    # only offers delta=0, e.g. m1-m2 ? 0
    # we are however interested in m1-m2 ? delta
    t_value = stats_tvalue(a, b, axis, delta)

    # 2-sample degrees of freedom is the sum of the array sizes - 2.
    dof = a.shape[axis] + b.shape[axis] - 2

    if left:
        # left-tailed test, e.g. m1-m2 <= delta
        return sc.t.cdf(t_value, dof)
    else:
        # right-tailed test, e.g. m1-m2 >= delta
        return sc.t.sf(t_value, dof)
    # a left+right tailed test is a 2-tail t-test and can be done using ttest_ind for delta=0

def print_comparable_analysis(comparable_metrics_iter, label_header, data_header, output_comparable: str, output_comparable_significant: str):
    baseline_value = _BASELINE[1]
    baseline_index = label_header.index(_BASELINE[0])

    old_baseline_label_list = None
    delta = _DELTA
    filter_value = _IGNORE_PAIR[1]
    filter_index = label_header.index(_IGNORE_PAIR[0])

    pvalue_threshold = _PVALUE_THRESHOLD
    ci_threshold = (1 - _PVALUE_THRESHOLD) * 100.0

    with open(output_comparable, "w") as output_file:

        csv_writer = csv.writer(output_file)
        csv_writer.writerow(label_header + ['mean', 'mean_diff', 'sem', 'pvalue_2tailed', 'pvalue_gt%d' % (_DELTA), 'pvalue_gt%d' % (_DELTA2)])

        print("------------------------------------------------------------------")
        print("Comparison against the baseline %s = %s" % (_BASELINE, baseline_value))
        print("--- Right-tailed t-test checks if the baseline mean exceeds the current %s mean by at least %d" % (_BASELINE[0], delta))
        print()

        global_stats = {'better_than_delta': [], 'better_than_delta_p95': []}

        for nested_it in comparable_metrics_iter:
            print("************************")

            better_than_delta = []
            better_than_delta_p95 = []

            saw_baseline_once = False

            for ((baseline_label_list, baseline_np_data_2d), (rest_label_list, rest_np_data_2d)) in nested_it:
                _debug_print("baseline_label_list:", baseline_label_list)
                _debug_print("baseline_np_data_2d:", baseline_np_data_2d)
                _debug_print("rest_label_list:", rest_label_list)
                _debug_print("rest_np_data_2d:", rest_np_data_2d)

                mean_diff = baseline_np_data_2d.mean(axis=0) - rest_np_data_2d.mean(axis=0)
                # 2-sample 2-tailed t-test with delta=0
                # e.g. "Is it true that usually the two sample means are different?"
                t_statistic, t_pvalue = sc.ttest_ind(baseline_np_data_2d, rest_np_data_2d, axis=0)

                # 2-sample 1-tailed t-test with delta=50
                # e.g. "Is the baseline mean usually better than the current mean by at least 50ms?"
                t2 = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)
                p2 = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)

                t2_b = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)
                p2_b = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)

                print("%s vs %s" % (rest_label_list, baseline_value))
                print(" ", data_header)
                print("Mean Difference: ", mean_diff)
                print("T-test (2-tailed) != 0: t=%s, p=%s" % (t_statistic, t_pvalue))
                print("T-test (right-tailed) >= %d: t=%s, p=%s" % (_DELTA, t2, p2))
                print("T-test (right-tailed) >= %d: t=%s, p=%s" % (_DELTA2, t2_b, p2_b))

                def write_out_values(label_list, *args):
                    csv_writer.writerow(label_list + [i[_PLOT_DATA_INDEX] for i in args])

                sem = stats_standard_error(baseline_np_data_2d, rest_np_data_2d, axis=0)
                if not saw_baseline_once:
                    saw_baseline_once = True
                    base_sem = stats_standard_error_one(baseline_np_data_2d, axis=0)
                    write_out_values(baseline_label_list, baseline_np_data_2d.mean(axis=0), [0], base_sem, [None], [None], [None])
                write_out_values(rest_label_list, rest_np_data_2d.mean(axis=0), mean_diff, sem, t_pvalue, p2, p2_b)

                # now do the global statistics aggregation

                if rest_label_list[filter_index] == filter_value:
                    continue

                if mean_diff > delta:
                    better_than_delta.append((mean_diff, p2, rest_label_list))

                    if p2 <= pvalue_threshold:
                        better_than_delta_p95.append((mean_diff, rest_label_list))

            if better_than_delta:
                global_stats['better_than_delta'].append(better_than_delta)
            if better_than_delta_p95:
                global_stats['better_than_delta_p95'].append(better_than_delta_p95)

        print("------------------------")
        print("Global statistics:")
        print("//// Rows with %s=%s are ignored here." % _IGNORE_PAIR)
        print("- # of results with mean diff better than delta(%d) = %d" % (delta, len(global_stats['better_than_delta'])))
        print(" > (meandiff, pvalue, labels)")
        for i in global_stats['better_than_delta']:
            print(" > %s" % i)
        print("- # of results with mean diff better than delta(%d) CI%d%% = %d" % (delta, ci_threshold, len(global_stats['better_than_delta_p95'])))
        print(" > (meandiff, labels)")
        for i in global_stats['better_than_delta_p95']:
            print(" > %s" % i)

def main():
    global _debug
    global _DELTA
    global _PVALUE_THRESHOLD

    opts = parse_options()
    _debug = opts.debug
    _debug_print("parsed options: ", opts)

    _PVALUE_THRESHOLD = opts.pvalue_threshold or _PVALUE_THRESHOLD
    # apply the -dt/--delta-threshold option; previously it was parsed but never used.
    _DELTA = opts.delta_threshold or _DELTA

    for file_name in opts.input_files:
        with open(file_name, 'r') as input_file:
            (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
            print_analysis(grouped_numpy_iter, label_header, data_header, opts.output_samples)

        with open(file_name, 'r') as input_file:
            (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
            without_baseline_iter = group_by_without_baseline_key(grouped_numpy_iter, label_header)
            # _debug_print_gen(without_baseline_iter)

            comparable_metrics_iter = iterate_comparable_metrics(without_baseline_iter, label_header)
            print_comparable_analysis(comparable_metrics_iter, label_header, data_header, opts.output_comparable, opts.output_comparable_significant)

    return 0


if __name__ == '__main__':
    sys.exit(main())
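
# Example input format (a sketch for illustration; the package name and the metric
# column after the label columns are hypothetical, only the label columns are fixed
# by _LABEL_COLUMNS):
#
#   packages,readaheads,compiler_filters,total_time_ms
#   com.example.app,cold,speed-profile,812
#   com.example.app,cold,speed-profile,794
#   com.example.app,warm,speed-profile,401
#
# Consecutive rows with identical label columns are grouped into one sample set by
# group_metrics_by_label; rows whose first cell starts with ';' are skipped as comments.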