#!/usr/bin/env python3 # Copyright 2021 The Chromium Authors # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. import argparse import logging import os import pandas as pd import numpy from scipy import stats as scipy_stats def get_diamond_string(diamond_count: int): if diamond_count == 0: return "~" elif diamond_count == 1: return "◆" elif diamond_count == 2: return "◆◆" elif diamond_count == 3: return "◆◆◆" elif diamond_count == 4: return "◆◆◆◆" def get_diamonds_count(significance: pd.DataFrame): """ This function emulates the "diamond" significance representation that is familiar to UMA users. """ assert (not (significance > 1).any().any()) assert (not (significance < 0).any().any()) # Avoid log10(0) which is undefined. significance = numpy.clip(significance, 0, 0.999999) # scipy_stats.norm.cdf(1.96) = 0.975 and we're interested in 2 tail # test. 1.96 gives a 0.05 p-value. Multiply by 2 here to correct. p_value = (1 - significance) * 2 # floor() to avoid exaggerating results and to round. # absolute() to make the result positive. log_p_value = numpy.floor(numpy.absolute(numpy.log10(p_value))) # Clip because 4 diamond is the max no matter the p-value. return numpy.clip(log_p_value, 0, 4) def compute_mean_and_stderr(summary_path: str): df = pd.read_csv(summary_path) # skipna because no line has as all measurements. This is because of the # different sampling rates of the data sources in power_sampler # and power_metrics. means = df.mean(skipna=True) # Calculate the standard error of each column. stderrs = df.std(skipna=True) / numpy.sqrt(df.count()) stats = means.to_frame().join(stderrs.to_frame(), lsuffix='mean', rsuffix='stderr') stats = stats.rename(columns={"0mean": "mean", "0stderr": "stderr"}) return stats def percent_difference(first_value: pd.DataFrame, second_value: pd.DataFrame): """ Returns the comparative percentage difference between two values/columns. The result is to be read as : |second_value| is X% smaller/larger than |first_value|. Ex: percent_difference(20, 10) --> -50 Ex: percent_difference(10, 50) --> 500 """ return ((second_value - first_value) / first_value) * 100 def compare(data_dir: str, baseline_summary: str, alternative_summary: str): """Open two summary files and compare their values. Saves the results in data_dir. Args: data_dir: The directory to save the comparison csv in. baseline_summary: summary.csv for the baseline. alternative_summary: summary.csv for the comparison. """ # Get names of the browsers being compared from the paths. baseline_name = os.path.basename( os.path.dirname(baseline_summary)).split("_")[0] alternative_name = os.path.basename( os.path.dirname(alternative_summary)).split("_")[0] all_stats = [] # Extract mean and std values for each column of |summary| into a new # dataframe. baseline_stats = compute_mean_and_stderr(baseline_summary) alternative_stats = compute_mean_and_stderr(alternative_summary) # Join the calculated values for both browsers into a single dataframe. comparison_summary = baseline_stats.join(alternative_stats, lsuffix=f"_{baseline_name}", rsuffix=f"_{alternative_name}") # Calculate the difference in percent between the baseline and comparison. comparison_summary["difference"] = percent_difference( baseline_stats["mean"], alternative_stats["mean"]) # See https://www.cliffsnotes.com/study-guides/statistics/univariate-inferential-tests/two-sample-z-test-for-comparing-two-means comparison_summary["z_score"] = (baseline_stats["mean"] - alternative_stats["mean"]) / numpy.sqrt( pow(baseline_stats["stderr"], 2) + pow(alternative_stats["stderr"], 2)) # See https://machinelearningmastery.com/critical-values-for-statistical-hypothesis-testing/ comparison_summary["significance_level"] = scipy_stats.norm.cdf( abs(comparison_summary["z_score"])) diamond_count = get_diamonds_count(comparison_summary["significance_level"]) comparison_summary["diamonds"] = diamond_count.apply(get_diamond_string) # Drop results for which comparing the mean makes no sense. comparison_summary = comparison_summary.drop([ 'battery_max_capacity', 'battery_current_capacity', 'sample_time', 'elapsed_ns' ]) # Display and save results. logging.info(comparison_summary) comparison_summary.to_csv(f"{data_dir}/comparison_summary.csv") def main(): parser = argparse.ArgumentParser( description='Compares two summary files for analysis.') parser.add_argument("--output_dir", help="Directory where to write the comparison file.", required=True) parser.add_argument("--baseline_dir", help="Directory containing the baseline benchmark data.", required=True) parser.add_argument( "--alternative_dir", help="Directory containing the alternative benchmark data.", required=True) parser.add_argument('--verbose', action='store_true', help='Print verbose output.') args = parser.parse_args() if args.verbose: log_level = logging.DEBUG else: log_level = logging.INFO logging.basicConfig(format='%(levelname)s: %(message)s', level=log_level) baseline_summary_path = os.path.join(args.baseline_dir, "summary.csv") alternative_summary_path = os.path.join(args.alternative_dir, "summary.csv") summaries = [baseline_summary_path, alternative_summary_path] for summary in summaries: if not os.path.isfile(summary): logging.error(f"summary.csv missing in {summary}.") sys.exit(-1) compare(args.output_dir, summaries[0], summaries[1]) if __name__ == "__main__": # Avoid scientific notation when printing numbers. pd.options.display.float_format = '{:.6f}'.format main()