1#!/usr/bin/env python3
2#
3# Copyright 2018, The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#     http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17"""
18Perform statistical analysis on measurements produced by app_startup_runner.py
19
20Install:
21$> sudo apt-get install python3-scipy
22
23Usage:
24$> ./analyze_metrics.py <filename.csv> [<filename2.csv> ...]
25$> ./analyze_metrics.py --help
26"""
27
28import argparse
29import csv
30import itertools
31import os
32import subprocess
33import sys
34import tempfile
35from typing import Any, List, Dict, Iterable, TextIO, Tuple
36
37from scipy import stats as sc
38import numpy as np
39
40
41# These CSV columns are considered labels. Everything after them in the same row are metrics.
42_LABEL_COLUMNS=['packages', 'readaheads', 'compiler_filters']
43# The metric series with the 'cold' readahead is the baseline.
44# All others (warm, jit, etc) are the potential improvements.
45
46#fixme: this should probably be an option
47_BASELINE=('readaheads', 'cold')
48# ignore this for some statistic calculations
49_IGNORE_PAIR=('readaheads', 'warm')
50_PLOT_SUBKEY='readaheads'
51_PLOT_GROUPKEY='packages'
52_PLOT_DATA_INDEX = 0
53_DELTA=50
54_DELTA2=100
55_PVALUE_THRESHOLD=0.10
56_debug = False  # See -d/--debug flag.
57
58def parse_options(argv: List[str] = None):
59  """Parse command line arguments and return an argparse Namespace object."""
60  parser = argparse.ArgumentParser(description="Perform statistical analysis on measurements produced by app_start_runner.py.")
61  parser.add_argument('input_files', metavar='file.csv', nargs='+', help='CSV file produced by app_startup_runner.py')
62
63  parser.add_argument('-d', '--debug', dest='debug', action='store_true', help='Add extra debugging output')
64  parser.add_argument('-os', '--output-samples', dest='output_samples', default='/dev/null', action='store', help='Store CSV for per-sample data')
65  parser.add_argument('-oc', '--output-comparable', dest='output_comparable', default='/dev/null', action='store', help='Output CSV for comparable against baseline')
66  parser.add_argument('-ocs', '--output-comparable-significant', dest='output_comparable_significant', default='/dev/null', action='store', help='Output CSV for comparable against baseline (significant only)')
67  parser.add_argument('-pt', '--pvalue-threshold', dest='pvalue_threshold', type=float, default=_PVALUE_THRESHOLD, action='store')
68  parser.add_argument('-dt', '--delta-threshold', dest='delta_threshold', type=int, default=_DELTA, action='store')
69
70  return parser.parse_args(argv)
71
72def _debug_print(*args, **kwargs):
73  """Print the args to sys.stderr if the --debug/-d flag was passed in."""
74  global _debug
75  if _debug:
76    print(*args, **kwargs, file=sys.stderr)
77
78def _expand_gen_repr(args):
79  new_args_list = []
80  for i in args:
81    # detect iterable objects that do not have their own override of __str__
82    if hasattr(i, '__iter__'):
83      to_str = getattr(i, '__str__')
84      if to_str.__objclass__ == object:
85        # the repr for a generator is just type+address, expand it out instead.
86        new_args_list.append([_expand_gen_repr([j])[0] for j in i])
87        continue
88    # normal case: uses the built-in to-string
89    new_args_list.append(i)
90  return new_args_list
91
92def _debug_print_gen(*args, **kwargs):
93  """Like _debug_print but will turn any iterable args into a list."""
94  if not _debug:
95    return
96
97  new_args_list = _expand_gen_repr(args)
98  _debug_print(*new_args_list, **kwargs)
99
100def read_headers(input_file: TextIO) -> Tuple[List[str], List[str]]:
101  _debug_print("read_headers for file: ", input_file.name)
102  csv_reader = csv.reader(input_file)
103
104  label_num_columns = len(_LABEL_COLUMNS)
105
106  try:
107    header = next(csv_reader)
108  except StopIteration:
109    header = None
110  _debug_print('header', header)
111
112  if not header:
113    return (None, None)
114
115  labels = header[0:label_num_columns]
116  data = header[label_num_columns:]
117
118  return (labels, data)
119
120def read_labels_and_data(input_file: TextIO) -> Iterable[Tuple[List[str], List[int]]]:
121  _debug_print("print_analysis for file: ", input_file.name)
122  csv_reader = csv.reader(input_file)
123
124  # Skip the header because it doesn't contain any data.
125  # To get the header see read_headers function.
126  try:
127    header = next(csv_reader)
128  except StopIteration:
129    header = None
130
131  label_num_columns = len(_LABEL_COLUMNS)
132
133  for row in csv_reader:
134    if len(row) > 0 and row[0][0] == ';':
135      _debug_print("skip comment line", row)
136      continue
137
138    labels = row[0:label_num_columns]
139    data = [int(i) for i in row[label_num_columns:]]
140
141#    _debug_print("labels:", labels)
142#    _debug_print("data:", data)
143
144    yield (labels, data)
145
146def group_metrics_by_label(it: Iterable[Tuple[List[str], List[int]]]):
147  prev_labels = None
148  data_2d = []
149
150  for label_list, data_list in it:
151    if prev_labels != label_list:
152      if prev_labels:
153#        _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
154        yield (prev_labels, data_2d)
155      data_2d = []
156
157    data_2d.append(data_list)
158    prev_labels = label_list
159
160  if prev_labels:
161#    _debug_print("grouped labels:", prev_labels, "data_2d:", data_2d)
162    yield (prev_labels, data_2d)
163
164def data_to_numpy(it: Iterable[Tuple[List[str], List[List[int]]]]) -> Iterable[Tuple[List[str], Any]]:
165  for label_list, data_2d in it:
166    yield (label_list, np.asarray(data_2d, dtype=int))
167
168def iterate_columns(np_data_2d):
169  for col in range(np_data_2d.shape[1]):
170    col_as_array = np_data_2d[:, col]
171    yield col_as_array
172
173def confidence_interval(np_data_2d, percent=0.95):
174  """
175  Given some data [[a,b,c],[d,e,f,]...]
176
177  We assume the same metric is in the column (e.g. [a,d])
178  and that data in the rows (e.g. [b,e]) are separate metric values.
179
180  We then calculate the CI for each metric individually returning it as a list of tuples.
181  """
182  arr = []
183  for col_2d in iterate_columns(np_data_2d):
184    mean = col_2d.mean()
185    sigma = col_2d.std()
186
187    ci = sc.norm.interval(percent, loc=mean, scale=sigma / np.sqrt(len(col_2d)))
188    arr.append(ci)
189
190  # TODO: This seems to be returning NaN when all the samples have the same exact value
191  # (e.g. stddev=0, which can trivially happen when sample count = 1).
192
193  return arr
194
195def print_analysis(it, label_header: List[str], data_header: List[str], output_samples: str):
196  print(label_header)
197
198  with open(output_samples, "w") as output_file:
199
200    csv_writer = csv.writer(output_file)
201    csv_writer.writerow(label_header + ['mean', 'std', 'confidence_interval_a', 'confidence_interval_b'])
202
203    for label_list, np_data_2d in it:
204      print("**********************")
205      print(label_list)
206      print()
207      print("      ", data_header)
208      # aggregate computation column-wise
209      print("Mean: ", np_data_2d.mean(axis=0))
210      print("Std:  ", np_data_2d.std(axis=0))
211      print("CI95%:", confidence_interval(np_data_2d))
212      print("SEM:  ", stats_standard_error_one(np_data_2d, axis=0))
213
214      #ci = confidence_interval(np_data_2d)[_PLOT_DATA_INDEX]
215      sem = stats_standard_error_one(np_data_2d, axis=0)[_PLOT_DATA_INDEX]
216      mean = np_data_2d.mean(axis=0)[_PLOT_DATA_INDEX]
217
218      ci = (mean - sem, mean + sem)
219
220      csv_writer.writerow(label_list + [mean, np_data_2d.std(axis=0)[_PLOT_DATA_INDEX], ci[0], ci[1]])
221
222def from_file_group_by_labels(input_file):
223  (label_header, data_header) = read_headers(input_file)
224  label_data_iter = read_labels_and_data(input_file)
225  grouped_iter = group_metrics_by_label(label_data_iter)
226  grouped_numpy_iter = data_to_numpy(grouped_iter)
227
228  return grouped_numpy_iter, label_header, data_header
229
230def list_without_index(list, index):
231  return list[:index] + list[index+1:]
232
233def group_by_without_baseline_key(grouped_numpy_iter, label_header):
234  """
235  Data is considered comparable if the only difference is the baseline key
236  (i.e. the readahead is different but the package, compilation filter, etc, are the same).
237
238  Returns iterator that's grouped by the non-baseline labels to an iterator of
239  (label_list, data_2d).
240  """
241  baseline_index = label_header.index(_BASELINE[0])
242
243  def get_label_without_baseline(tpl):
244    label_list, _ = tpl
245    return list_without_index(label_list, baseline_index)
246  # [['pkgname', 'compfilter', 'warm'], [data]]
247  # [['pkgname', 'compfilter', 'cold'], [data2]]
248  # [['pkgname2', 'compfilter', 'warm'], [data3]]
249  #
250  #   ->
251  # ( [['pkgname', 'compfilter', 'warm'], [data]]      # ignore baseline label change.
252  #   [['pkgname', 'compfilter', 'cold'], [data2]] ),  # split here because the pkgname changed.
253  # ( [['pkgname2', 'compfilter', 'warm'], [data3]] )
254  for group_info, it in itertools.groupby(grouped_numpy_iter, key = get_label_without_baseline):
255    yield it
256
257  # TODO: replace this messy manual iteration/grouping with pandas
258
259def iterate_comparable_metrics(without_baseline_iter, label_header):
260  baseline_index = label_header.index(_BASELINE[0])
261  baseline_value = _BASELINE[1]
262
263  _debug_print("iterate comparables")
264
265  def is_baseline_fun(tp):
266    ll, dat = tp
267    return ll[baseline_index] == baseline_value
268
269  # iterating here when everything but the baseline key is the same.
270  for it in without_baseline_iter:
271    it1, it2 = itertools.tee(it)
272
273    # find all the baseline data.
274    baseline_filter_it = filter(is_baseline_fun, it1)
275
276    # find non-baseline data.
277    nonbaseline_filter_it = itertools.filterfalse(is_baseline_fun, it2)
278
279    yield itertools.product(baseline_filter_it, nonbaseline_filter_it)
280
281def stats_standard_error_one(a, axis):
282  a_std = a.std(axis=axis, ddof=0)
283  a_len = a.shape[axis]
284
285  return a_std / np.sqrt(a_len)
286
287def stats_standard_error(a, b, axis):
288  a_std = a.std(axis=axis, ddof=0)
289  b_std = b.std(axis=axis, ddof=0)
290
291  a_len = a.shape[axis]
292  b_len = b.shape[axis]
293
294  temp1 = a_std*a_std/a_len
295  temp2 = b_std*b_std/b_len
296
297  return np.sqrt(temp1 + temp2)
298
299def stats_tvalue(a, b, axis, delta = 0):
300  a_mean = a.mean(axis=axis)
301  b_mean = b.mean(axis=axis)
302
303  return (a_mean - b_mean - delta) / stats_standard_error(a, b, axis)
304
305def stats_pvalue(a, b, axis, delta, left:bool = False):
306  """
307  Single-tailed 2-sample t-test.
308
309  Returns p-value for the null hypothesis: mean(a) - mean(b) >= delta.
310  :param a: numpy 2d array
311  :param b: numpy 2d array
312  :param axis: which axis to do the calculations across
313  :param delta: test value of mean differences
314  :param left: if true then use <= delta instead of >= delta
315  :return: p-value
316  """
317  # implement our own pvalue calculation because the built-in t-test (t,p values)
318  # only offer delta=0 , e.g. m1-m1 ? 0
319  # we are however interested in m1-m2 ? delta
320  t_value = stats_tvalue(a, b, axis, delta)
321
322  # 2-sample degrees of freedom is using the array sizes - 2.
323  dof = a.shape[axis] + b.shape[axis] - 2
324
325  if left:
326    # left tailed test. e.g. m1-m2 <= delta
327    return sc.t.cdf(t_value, dof)
328  else:
329    # right tailed test. e.g. m1-m2 >= delta
330    return sc.t.sf(t_value, dof)
331  # a left+right tailed test is a 2-tail t-test and can be done using ttest_ind for delta=0
332
333def print_comparable_analysis(comparable_metrics_iter, label_header, data_header, output_comparable: str, output_comparable_significant: str):
334  baseline_value = _BASELINE[1]
335  baseline_index = label_header.index(_BASELINE[0])
336
337  old_baseline_label_list = None
338  delta = _DELTA
339  filter_value = _IGNORE_PAIR[1]
340  filter_index = label_header.index(_IGNORE_PAIR[0])
341
342  pvalue_threshold = _PVALUE_THRESHOLD
343  ci_threshold = (1 - _PVALUE_THRESHOLD) * 100.0
344
345  with open(output_comparable, "w") as output_file:
346
347    csv_writer = csv.writer(output_file)
348    csv_writer.writerow(label_header + ['mean', 'mean_diff', 'sem', 'pvalue_2tailed', 'pvalue_gt%d' %(_DELTA), 'pvalue_gt%d' %(_DELTA2)])
349
350    print("------------------------------------------------------------------")
351    print("Comparison against the baseline %s = %s" %(_BASELINE, baseline_value))
352    print("--- Right-tailed t-test checks if the baseline >= current %s by at least %d" %(_BASELINE[0], delta))
353    print()
354
355    global_stats = {'better_than_delta': [], 'better_than_delta_p95': []}
356
357    for nested_it in comparable_metrics_iter:
358      print("************************")
359
360      better_than_delta = []
361      better_than_delta_p95 = []
362
363      saw_baseline_once = False
364
365      for ((baseline_label_list, baseline_np_data_2d), (rest_label_list, rest_np_data_2d)) in nested_it:
366        _debug_print("baseline_label_list:", baseline_label_list)
367        _debug_print("baseline_np_data_2d:", baseline_np_data_2d)
368        _debug_print("rest_label_list:", rest_label_list)
369        _debug_print("rest_np_data_2d:", rest_np_data_2d)
370
371        mean_diff = baseline_np_data_2d.mean(axis=0) - rest_np_data_2d.mean(axis=0)
372        # 2-sample 2-tailed t-test with delta=0
373        # e.g. "Is it true that usually the two sample means are different?"
374        t_statistic, t_pvalue = sc.ttest_ind(baseline_np_data_2d, rest_np_data_2d, axis=0)
375
376        # 2-sample 1-tailed t-test with delta=50
377        # e.g. "Is it true that usually the sample means better than 50ms?"
378        t2 = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)
379        p2 = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=delta)
380
381        t2_b = stats_tvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)
382        p2_b = stats_pvalue(baseline_np_data_2d, rest_np_data_2d, axis=0, delta=_DELTA2)
383
384        print("%s vs %s" %(rest_label_list, baseline_value))
385        print("                           ", data_header)
386        print("Mean Difference:           ", mean_diff)
387        print("T-test (2-tailed) != 0:      t=%s, p=%s" %(t_statistic, t_pvalue))
388        print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA, t2, p2))
389        print("T-test (right-tailed) >= %d: t=%s, p=%s" %(_DELTA2, t2_b, p2_b))
390
391        def write_out_values(label_list, *args):
392          csv_writer.writerow(label_list + [i[_PLOT_DATA_INDEX] for i in args])
393
394        sem = stats_standard_error(baseline_np_data_2d, rest_np_data_2d, axis=0)
395        if saw_baseline_once == False:
396          saw_baseline_once = True
397          base_sem = stats_standard_error_one(baseline_np_data_2d, axis=0)
398          write_out_values(baseline_label_list, baseline_np_data_2d.mean(axis=0), [0], base_sem, [None], [None], [None])
399        write_out_values(rest_label_list, rest_np_data_2d.mean(axis=0), mean_diff, sem, t_pvalue, p2, p2_b)
400
401        # now do the global statistics aggregation
402
403        if rest_label_list[filter_index] == filter_value:
404          continue
405
406        if mean_diff > delta:
407          better_than_delta.append((mean_diff, p2, rest_label_list))
408
409          if p2 <= pvalue_threshold:
410            better_than_delta_p95.append((mean_diff, rest_label_list))
411
412      if better_than_delta:
413        global_stats['better_than_delta'].append(better_than_delta)
414      if better_than_delta_p95:
415        global_stats['better_than_delta_p95'].append(better_than_delta_p95)
416
417    print("------------------------")
418    print("Global statistics:")
419    print("//// Rows with %s=%s are ignored here." %_IGNORE_PAIR)
420    print("- # of results with mean diff better than delta(%d)       = %d" %(delta, len(global_stats['better_than_delta'])))
421    print("    > (meandiff, pvalue, labels)")
422    for i in global_stats['better_than_delta']:
423      print("    > %s" %i)
424    print("- # of results with mean diff better than delta(%d) CI%d%% = %d" %(delta, ci_threshold, len(global_stats['better_than_delta_p95'])))
425    print("    > (meandiff, labels)")
426    for i in global_stats['better_than_delta_p95']:
427      print("    > %s" %i)
428
429def main():
430  global _debug
431  global _DELTA
432  global _PVALUE_THRESHOLD
433
434  opts = parse_options()
435  _debug = opts.debug
436  _debug_print("parsed options: ", opts)
437
438  _PVALUE_THRESHOLD = opts.pvalue_threshold or _PVALUE_THRESHOLD
439
440  for file_name in opts.input_files:
441    with open(file_name, 'r') as input_file:
442      (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
443      print_analysis(grouped_numpy_iter, label_header, data_header, opts.output_samples)
444
445    with open(file_name, 'r') as input_file:
446      (grouped_numpy_iter, label_header, data_header) = from_file_group_by_labels(input_file)
447      without_baseline_iter = group_by_without_baseline_key(grouped_numpy_iter, label_header)
448      #_debug_print_gen(without_baseline_iter)
449
450      comparable_metrics_iter = iterate_comparable_metrics(without_baseline_iter, label_header)
451      print_comparable_analysis(comparable_metrics_iter, label_header, data_header, opts.output_comparable, opts.output_comparable_significant)
452
453  return 0
454
455
456if __name__ == '__main__':
457  sys.exit(main())
458