Source code for useful_rdkit_utils.stat_utils

import math
from typing import Callable, List, Tuple

import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import pearsonr
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample


def bootstrap_confidence_interval(truth: List[float],
                                  pred: List[float],
                                  stat_function: Callable[[List[float], List[float]], float],
                                  num_iterations: int = 1000,
                                  interval: float = 95.0) -> Tuple[float, float, float]:
    """
    Calculate a confidence interval (CI) for a statistic of interest using bootstrap

    The statistic is evaluated once on the full data and once on each of
    ``num_iterations`` resamples drawn with replacement; the CI bounds are the
    matching percentiles of the resampled values.

    :param truth: the true values
    :param pred: the predicted values, same length as truth
    :param stat_function: the function to calculate the statistic of interest,
        called as stat_function(truth, pred), should return a single value
    :param num_iterations: number of bootstrap iterations
    :param interval: the confidence interval to calculate, as a percentage (e.g. 95.0)
    :return: CI lower bound, value of the statistic on the full data, CI upper bound
    :raises ValueError: if truth and pred differ in length or are empty
    """
    truth_arr = np.asarray(truth)
    pred_arr = np.asarray(pred)
    num_samples = len(truth_arr)
    if num_samples != len(pred_arr):
        raise ValueError("truth and pred must have the same length")
    if num_samples == 0:
        raise ValueError("truth and pred must not be empty")

    # Split the excluded probability mass evenly between the two tails
    lb = (100.0 - interval) / 2.0
    ub = 100.0 - lb

    stat_val = stat_function(truth, pred)

    stat_list = []
    for _ in range(num_iterations):
        # One shared set of indices keeps each truth/pred pair together.
        # Indices come from NumPy's global random state, so np.random.seed
        # controls reproducibility.
        idx = np.random.randint(0, num_samples, size=num_samples)
        # Use the caller's statistic here, not a hard-coded metric
        stat_list.append(stat_function(truth_arr[idx], pred_arr[idx]))
    return np.percentile(stat_list, lb), stat_val, np.percentile(stat_list, ub)
def pearson_confidence(r: float, num: int, interval: float = 0.95) -> Tuple[float, float]:
    """
    Calculate the lower and upper bounds of a two-sided confidence interval
    for a Pearson r (not R**2) using the Fisher z-transformation
    Inspired by https://stats.stackexchange.com/questions/18887

    :param r: Pearson's R
    :param num: number of data points, must be greater than 3
    :param interval: two-sided confidence level (0-1.0), e.g. 0.95 for a 95% CI
    :return: lower bound, upper bound
    :raises ZeroDivisionError: if num is 3
    :raises ValueError: if num is less than 3, or r is outside the open interval (-1, 1)
    """
    # Standard error of the Fisher-transformed correlation
    stderr = 1.0 / math.sqrt(num - 3)
    # A two-sided interval leaves (1 - interval) / 2 in each tail, so the
    # quantile is taken at 0.5 + interval / 2 (about 1.96 for interval=0.95)
    z_score = norm.ppf(0.5 + interval / 2.0)
    delta = z_score * stderr
    fisher_z = math.atanh(r)
    # Build the interval in z-space, then map back to the correlation scale
    lower = math.tanh(fisher_z - delta)
    upper = math.tanh(fisher_z + delta)
    return lower, upper
def max_possible_correlation(vals: List[float], error: float = 1 / 3.0,
                             method: Callable[[List[float], List[float]], float] = pearsonr,
                             cycles: int = 1000) -> float:
    """
    Estimate the best correlation achievable given a level of experimental error

    Each cycle adds zero-mean Gaussian noise (standard deviation ``error``) to
    every value and correlates the noisy copy with the original values; the
    mean correlation over all cycles is returned.
    Based on Brown, Muchmore, Hajduk http://www.sciencedirect.com/science/article/pii/S1359644609000403

    :param vals: experimental values (should be on a log scale)
    :param error: experimental error
    :param method: method for calculating the correlation, must take 2 lists and return correlation and p_value
    :param cycles: number of random cycles
    :return: maximum possible correlation
    """
    correlations = []
    for _ in range(cycles):
        # One independent noise draw per value, in input order
        perturbed = [measured + np.random.normal(0, error) for measured in vals]
        # Only the first element (the correlation) of the result is used
        correlation = method(vals, perturbed)[0]
        correlations.append(correlation)
    return np.mean(correlations)