# Machine learning for MagellanMapper
# Author: David Young, 2017, 2019
"""Machine learning and output for MagellanMapper.
"""
from collections import OrderedDict
from enum import Enum
from typing import Any, Callable, Dict, Sequence, Tuple
import numpy as np
import pandas as pd
from magmap.settings import config
from magmap.io import libmag
from magmap.io import df_io
[docs]
class GridSearchStats(Enum):
    """Grid search statistics categories.

    Each member's value is a short label string. The members are
    referenced by :func:`parse_grid_stats` when it builds the headers of
    its summary table.
    """
    PARAM = "Par"  # (hyper)parameter; used as a prefix for parameter columns
    PPV = "PPV"  # positive predictive value
    SENS = "Sens"  # sensitivity
    POS = "Pos"  # condition positive
    TP = "TP"  # true positive
    FP = "FP"  # false positive
    TN = "TN"  # true negative
    FN = "FN"  # false negative
    FDR = "FDR"  # false discovery rate
[docs]
def grid_search(
        hyperparams: "OrderedDict[str, Sequence[float]]",
        fnc: Callable[[Any], Tuple[Any, Sequence]],
        *fnc_args
) -> "OrderedDict[str, Tuple[Sequence, Sequence, str, OrderedDict]]":
    """Perform a grid search for hyperparameter optimization.

    Scalar entries in ``hyperparams`` are written once to the ROI profile
    (``config.roi_profile``). The remaining, non-scalar entries are searched
    exhaustively in the order given: every combination of values of all but
    the last iterable hyperparameter forms one group, and within each group
    ``fnc`` is called once per value of the last iterable hyperparameter.

    The ROI profile is modified in place and is not restored afterward, so
    it retains the last values set during the search.

    Args:
        hyperparams: Ordered dictionary with items of the format:
            ``(:class:`profiles.ROIProfile` parameter, (a, b, c, ...))``.
            A scalar value is applied directly instead of being iterated.
        fnc: Function to call during the grid search, which must
            return ``stats, summaries``. ``summaries`` is an iterable whose
            items are collected and printed after the search completes.
        *fnc_args: Arguments to pass to ``fnc``.

    Returns:
        Ordered dictionary of stats suitable for parsing in
        :meth:`parse_grid_stats`. Each key is a group name built from the
        parent hyperparameters and their values, ending with the name of
        the last hyperparameter. Each value is a tuple of the list of
        stats returned by ``fnc``, the sequence of values of the last
        hyperparameter, the last hyperparameter's key, and an
        ``OrderedDict`` of the parent hyperparameters and their values.

    Raises:
        IndexError: If ``hyperparams`` contains no non-scalar value, since
            there is then no hyperparameter to iterate over.

    """
    # get the ROI profile, whose settings are updated in place
    settings = config.roi_profile
    file_summaries = []
    iterable_keys = []  # hyperparameters to iterate through
    iterable_dict = OrderedDict()  # group results
    for key2, value2 in hyperparams.items():
        if np.isscalar(value2):
            # set scalar values rather than iterating and processing
            settings[key2] = value2
            print("changed {} to {}".format(key2, value2))
        else:
            print("adding iterable setting {}".format(key2))
            iterable_keys.append(key2)

    def grid_iterate(i, grid_dict, name, parent_params):
        """Recurse through the iterable hyperparameters from index ``i``."""
        key = iterable_keys[i]
        name = key if name is None else name + "-" + key
        print("name: {}".format(name))
        if i < len(iterable_keys) - 1:
            for j in grid_dict[key]:
                settings[key] = j
                # track parents and their values for given run
                child_params = parent_params.copy()
                child_params[key] = j
                # build each child name from the unchanged base name so
                # that a previous value, numeric or not, never leaks into
                # the name and parent portions are never truncated
                if libmag.is_number(j):
                    child_name = "{}({:.3g})".format(name, j)
                else:
                    child_name = "{} {}".format(name, j)
                grid_iterate(i + 1, grid_dict, child_name, child_params)
        else:
            # process each value in parameter array
            stats = []
            last_param_vals = grid_dict[key]
            for param in last_param_vals:
                print("===============================================\n"
                      "Grid search hyperparameters {} for {}"
                      .format(name, libmag.format_num(param, 3)))
                settings[key] = param
                stat, summaries = fnc(*fnc_args)
                stats.append(stat)
                file_summaries.extend(summaries)
            iterable_dict[name] = (
                stats, last_param_vals, key, parent_params)

    grid_iterate(0, hyperparams, None, OrderedDict())
    # summary of each file collected together
    for summary in file_summaries:
        print(summary)
    return iterable_dict
[docs]
def parse_grid_stats(
        stats: "OrderedDict[str, Tuple[Sequence, Sequence, str, OrderedDict]]"
) -> Tuple[Dict[str, Tuple[Sequence, Sequence, Sequence]], pd.DataFrame]:
    """Parse stats from a grid search.

    Args:
        stats: Dictionary where key is a string with the parameters
            up to the last parameter group, and each value is a tuple of
            the raw stats as (pos, true_pos, false_pos); the array of
            values for the last parameter; the last parameter key; and an
            ``OrderedDict`` of the parent parameters and their values for
            the given set of stats.

    Returns:
        Tuple of ``group_stats`` and ``df``:

        - ``group_stats`` is a dictionary of stats, where keys
          correspond to ``stats`` keys, and values are tuples of the
          false discovery rate, sensitivity, and last parameter group value,
          each as sequences
        - ``df`` is a data frame summarizing the stats, with one row per
          value of the last parameter in each group

    """
    # parse a grid search
    stats_for_df = {}
    headers = None
    group_dict = {}
    param_keys = []
    for key, value in stats.items():
        # parse stats from a set of parameters
        grid_stats = np.array(value[0])  # raw stats
        # last parameter is given separately since it is actively varying
        last_param_vals, last_param_key, parent_params = value[1:]
        if headers is None:
            # set up headers from the first group: parent parameter columns
            # first, then the last parameter, then each stat; all headers
            # are plain strings so that column labels are of a single type
            param_prefix = GridSearchStats.PARAM.value
            headers = [
                "_".join((param_prefix, parent)) for parent in parent_params]
            headers.append("_".join((param_prefix, last_param_key)))
            headers.extend(
                stat.value for stat in (
                    GridSearchStats.PPV,
                    GridSearchStats.SENS,
                    GridSearchStats.POS,
                    GridSearchStats.TP,
                    GridSearchStats.FP,
                    GridSearchStats.FDR,
                ))
            # parameter names are also used to name the output file
            param_keys.extend(parent_params)
            param_keys.append(last_param_key)

        # false discovery rate, the complement of PPV, is used since true
        # negatives are not available
        fdr = np.subtract(
            1, np.divide(grid_stats[:, 1],
                         np.add(grid_stats[:, 1], grid_stats[:, 2])))
        sens = np.divide(grid_stats[:, 1], grid_stats[:, 0])
        for i, last_param_val in enumerate(last_param_vals):
            # one row per value of the last parameter, in header order
            stat_list = [
                *parent_params.values(),
                last_param_val, 1 - fdr[i], sens[i],
                *grid_stats[i].astype(int), fdr[i],
            ]
            for header, stat in zip(headers, stat_list):
                stats_for_df.setdefault(header, []).append(stat)
        group_dict[key] = (fdr, sens, last_param_vals)
    print()

    # generate a data frame to summarize stats and save to file
    path_df = libmag.make_out_path(
        "gridsearch_{}.csv".format("_".join(param_keys)))
    df = df_io.dict_to_data_frame(stats_for_df, path_df, show=" ")
    return group_dict, df
# print an identifying message when this module is run directly as a script
if __name__ == "__main__":
    print("MagellanMapper machine learning")