# Source code for useful_rdkit_utils.stat_utils

import math
from typing import Callable, List, Tuple

import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import pearsonr
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample



# [docs]
def bootstrap_confidence_interval(truth: List[float], pred: List[float],
                                  stat_function: Callable[[List[float], List[float]], float],
                                  num_iterations: int = 1000, interval: float = 95.0) -> Tuple[float, float, float]:
    """ Calculate a confidence interval (CI) for a statistic of interest using bootstrap

    The (truth, pred) pairs are resampled with replacement ``num_iterations`` times, the
    statistic is recomputed on every resample, and the CI bounds are taken as percentiles
    of the resulting distribution.

    :param truth: the true values
    :param pred: the predicted values
    :param stat_function: the function to calculate the statistic of interest, called as
        ``stat_function(truth, pred)``; should return a single value. It is called once on the
        original inputs and once per bootstrap iteration on pandas Series holding the resample
    :param num_iterations: number of bootstrap iterations
    :param interval: the width of the confidence interval as a percentage (e.g. 95.0 for a 95% CI)
    :return: CI lower bound, value of the statistic on the original data, CI upper bound
    """
    # Percentiles that cut off equal tails on both sides, e.g. 2.5 and 97.5 for interval=95
    lb = (100.0 - interval) / 2.0
    ub = 100.0 - lb

    # Keep truth and pred in one frame so that every resample draws matched pairs
    result_df = pd.DataFrame({"truth": truth, "pred": pred})
    stat_val = stat_function(truth, pred)
    stat_list = []
    for _ in range(num_iterations):
        # Same number of rows as the input, drawn with replacement
        sample_df = result_df.sample(frac=1.0, replace=True)
        # Use the caller-supplied statistic (not a hard-coded metric) on each resample
        stat_list.append(stat_function(sample_df["truth"], sample_df["pred"]))
    return np.percentile(stat_list, lb), stat_val, np.percentile(stat_list, ub)




# [docs]
def pearson_confidence(r: float, num: int, interval: float = 0.95) -> Tuple[float, float]:
    """
    Calculate the lower and upper bounds of a two-sided confidence interval (95% by default)
    for a Pearson r (not R**2), using the Fisher z-transformation
    Inspired by https://stats.stackexchange.com/questions/18887

    :param r: Pearson's R
    :param num: number of data points, must be greater than 3
    :param interval: two-sided confidence level as a fraction (0-1.0), e.g. 0.95 for a 95% CI
    :return: lower bound, upper bound
    :raises ValueError: if num is not greater than 3
    """
    if num <= 3:
        # The standard error below is 1 / sqrt(num - 3), undefined for num <= 3
        raise ValueError(f"num must be greater than 3 to compute a confidence interval, got {num}")
    stderr = 1.0 / math.sqrt(num - 3)
    # Two-sided interval: the excluded probability (1 - interval) is split evenly
    # between both tails, so the critical value is the (1 - alpha / 2) quantile
    # (about 1.96 for a 95% CI, not the one-sided 1.645)
    z_score = norm.ppf(1.0 - (1.0 - interval) / 2.0)
    delta = z_score * stderr
    # Shift in Fisher z-space (atanh), then map back to the correlation scale (tanh)
    lower = math.tanh(math.atanh(r) - delta)
    upper = math.tanh(math.atanh(r) + delta)
    return lower, upper




# [docs]
def max_possible_correlation(vals: List[float], error: float = 1 / 3.0,
                             method: Callable[[List[float], List[float]], float] = pearsonr,
                             cycles: int = 1000) -> float:
    """
    Estimate the best correlation achievable when the measurements carry a given experimental error

    Each cycle adds independent Gaussian noise (mean 0, standard deviation ``error``) to every
    value, correlates the original values with the noisy copy, and the correlations from all
    cycles are averaged.
    Based on Brown, Muchmore, Hajduk http://www.sciencedirect.com/science/article/pii/S1359644609000403

    :param vals: experimental values (should be on a log scale)
    :param error: experimental error, used as the standard deviation of the added noise
    :param method: correlation function called with the original and the noisy values; the first
        element of its result is taken as the correlation (e.g. a (correlation, p_value) pair)
    :param cycles: number of random cycles
    :return: mean correlation over all cycles
    """

    def _correlation_with_noisy_copy():
        # One simulated repeat of the experiment: perturb each value independently
        perturbed = [measured + np.random.normal(0, error) for measured in vals]
        return method(vals, perturbed)[0]

    return np.mean([_correlation_with_noisy_copy() for _ in range(cycles)])