Source code for magmap.stats.mlearn

# Machine learning for MagellanMapper
# Author: David Young, 2017, 2019
"""Machine learning and output for MagellanMapper.
"""

from collections import OrderedDict
from enum import Enum
from typing import Any, Callable, Dict, Sequence, Tuple

import numpy as np
import pandas as pd

from magmap.settings import config
from magmap.io import libmag
from magmap.io import df_io


class GridSearchStats(Enum):
    """Categories of statistics reported for a grid search.

    Each member's value is the short label used for that statistic.
    """
    # (hyper)parameter
    PARAM = "Par"
    # positive predictive value
    PPV = "PPV"
    # sensitivity
    SENS = "Sens"
    # condition positive
    POS = "Pos"
    # true positive
    TP = "TP"
    # false positive
    FP = "FP"
    # true negative
    TN = "TN"
    # false negative
    FN = "FN"
    # false discovery rate
    FDR = "FDR"
def parse_grid_stats(
        stats: "OrderedDict[str, Tuple[Sequence, Sequence, str, OrderedDict]]"
) -> Tuple[Dict[str, Tuple[Sequence, Sequence, Sequence]], pd.DataFrame]:
    """Summarize grid search results per parameter group and as a table.

    Args:
        stats: Dictionary where each key is a string with the parameters
            up to the last parameter group. Each value is a tuple of:
            the raw stats, indexed by last-parameter value, with columns
            ``(pos, true_pos, false_pos)``; the sequence of values for
            the last parameter; the last parameter key; and an
            ``OrderedDict`` of the parent parameters and their values
            for the given set of stats.

    Returns:
        Tuple of ``group_stats`` and ``df``:
        - ``group_stats`` is a dictionary of stats, where keys correspond
          to ``stats`` keys, and values are tuples of the false discovery
          rate, sensitivity, and last parameter group values, each as
          sequences
        - ``df`` is the data frame summarizing the stats, as returned by
          ``df_io.dict_to_data_frame``

    """
    table_cols = {}  # column header -> list of values, one per table row
    col_names = None  # column headers, set up from the first group only
    group_stats = {}
    name_parts = []  # parameter names used to build the output filename

    for group_key, group_val in stats.items():
        # the last parameter is given separately from its parents since
        # it is the one actively varying within the group
        raw_counts, last_param_vals, last_param_key, parent_params = group_val
        counts = np.array(raw_counts)

        if not col_names:
            # parameter columns come first (parents, then the last
            # parameter), followed by one column per stat
            # NOTE(review): the stat columns are keyed by the enum members
            # themselves rather than their ``.value`` strings, unlike the
            # parameter columns; presumably ``df_io.dict_to_data_frame``
            # handles enum keys -- confirm
            param_prefix = GridSearchStats.PARAM.value
            parent_names = list(parent_params.keys())
            col_names = [
                "_".join((param_prefix, parent)) for parent in parent_names]
            col_names.append("_".join((param_prefix, last_param_key)))
            col_names.extend((
                GridSearchStats.PPV, GridSearchStats.SENS,
                GridSearchStats.POS, GridSearchStats.TP,
                GridSearchStats.FP, GridSearchStats.FDR,
            ))
            name_parts.extend(parent_names)
            name_parts.append(last_param_key)

        # false discovery rate, the inverse of PPV, is used since true
        # negatives are not available
        true_pos = counts[:, 1]
        fdr = 1 - true_pos / (true_pos + counts[:, 2])
        sens = true_pos / counts[:, 0]

        # add one table row per value of the last parameter
        parent_vals = list(parent_params.values())
        for idx, param_val in enumerate(last_param_vals):
            row = [
                *parent_vals, param_val, 1 - fdr[idx], sens[idx],
                *counts[idx].astype(int), fdr[idx]]
            for col, stat in zip(col_names, row):
                table_cols.setdefault(col, []).append(stat)

        group_stats[group_key] = (fdr, sens, last_param_vals)
        # NOTE(review): nesting of this blank-line print was ambiguous in
        # the reviewed text; kept once per parameter group -- confirm
        print()

    # generate a data frame to summarize stats and save to file
    out_name = "gridsearch_{}.csv".format("_".join(name_parts))
    out_path = libmag.make_out_path(out_name)
    df = df_io.dict_to_data_frame(table_cols, out_path, show=" ")
    return group_stats, df
if __name__ == "__main__":
    # running this module directly only prints an identifying banner
    print("MagellanMapper machine learning")