Module minder_utils.evaluate.eval_utils
Expand source code
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from ..formatting.format_util import y_to_categorical
def get_scores(y_true, y_pred):
if y_true.ndim > 1:
y_true = np.argmax(y_true, axis=1)
if y_pred.ndim > 1:
y_pred = np.argmax(y_pred, axis=1)
try:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
except ValueError:
return None, None, None, None
specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
return sensitivity, specificity, acc, f1
def split_by_ids(X, y, patient_ids, cat=True, valid_only=True, stratify=True, seed=0):
y[y == 0] = -1
y = y.reshape(-1, )
patient_ids = patient_ids.reshape(-1, )
# make sure the train and test set got both positive and negative patients
y_p_id = []
for p_id in np.unique(patient_ids):
_y = np.unique(y[patient_ids == p_id])
rng = np.random.default_rng(seed)
y_p_id.append(int(_y[0]) if len(_y) < 2 else rng.integers(0,2))
seed += 1
y_p_id = np.array(y_p_id)
y_p_id[y_p_id < 0] = 0
train_ids, test_ids = train_test_split(np.unique(patient_ids), test_size=0.33, random_state=seed, stratify=y_p_id if stratify else None)
test_y = y[np.isin(patient_ids, test_ids)]
if valid_only:
test_filter = np.isin(test_y, [-1, 1])
else:
test_filter = np.isin(test_y, np.unique(test_y))
if cat:
return X[np.isin(patient_ids, train_ids)], y_to_categorical(y[np.isin(patient_ids, train_ids)]), \
X[np.isin(patient_ids, test_ids)][test_filter], y_to_categorical(y[np.isin(patient_ids, test_ids)][
test_filter])
return X[np.isin(patient_ids, train_ids)], y[np.isin(patient_ids, train_ids)], \
X[np.isin(patient_ids, test_ids)], y[np.isin(patient_ids, test_ids)]
class StratifiedKFoldPids:
def __init__(self, n_splits=5, shuffle=False, random_state=None):
'''
This splits the data so that no train and test
split contain the same pid. They will contain
roughly the same number of positive
and negative samples.
This is based on: It will function in the same way.
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
Arguments
---------
- ```n_splits```: ```int```, optional:
The number of splits.
Defaults to ```5```.
- ```shuffle```: ```bool```, optional:
Whether to shuffle the order of the pids before
making the splits.
Defaults to ```False```.
- ```random_state```: ```_type_```, optional:
The random state for the random processes in the class.
Defaults to ```None```.
'''
self.n_splits = n_splits
self.shuffle = shuffle
self.random_state = random_state
return
def get_n_splits(self):
'''
Returns the number of splits
Returns
--------
- ```out```: ```int``` :
The number of splits
'''
return self.n_splits
def split_by_ids(self, y, pids, seed=0):
'''
An internal function that given a set of
labels and PIDs corresponding to the labels,
this function can return the pid values that
should be assigned to the training or testing
set for each split.
Arguments
---------
- ```y```: ```array```:
Labels.
- ```pids```: ```array```:
PIDs corresponding to ```y```.
- ```seed```: ```int```, optional:
The random seed for the random processes.
Defaults to ```0```.
Returns
--------
- ```out```: ```_type_``` :
PID values that should be assigned
to the training or testing set for each split.
'''
labels = np.copy(y)
labels[labels == 0] = -1
labels = labels.reshape(-1, )
pids = pids.reshape(-1, )
# make sure the train and test set got both positive and negative patients
y_p_id = []
for p_id in np.unique(pids):
_y = np.unique(y[pids == p_id])
rng = np.random.default_rng(seed)
y_p_id.append(int(_y[0]) if len(_y) < 2 else rng.integers(0,2))
seed += 1
y_p_id = np.array(y_p_id)
y_p_id[y_p_id < 0] = 0
splitter = StratifiedKFold(n_splits=self.n_splits,
shuffle=self.shuffle,
random_state=seed if self.shuffle else None)
splits = list(splitter.split(np.unique(pids), y=y_p_id))
return [[np.unique(pids)[train_idx], np.unique(pids)[test_idx]] for train_idx, test_idx in splits]
def split(self, X, y, pids):
'''
This function produces the splits that can be used for training
and testing.
Arguments
---------
- ```X```: ```array```:
X input. This isn't used and so anything can be passed here.
- ```y```: ```array```:
The labels. This is used to stratify the data.
- ```pids```: ```_type_```:
The PIDs that is used to split the data.
Returns
--------
- ```out```: ```list``` :
List of train-test splits. This
list has length equal to ```n_splits```.
'''
rng = np.random.default_rng(self.random_state)
seed = rng.integers(0,1e6)
list_of_splits_pids = self.split_by_ids(y=y, pids=pids, seed=seed)
list_of_splits = []
for train_pids, test_pids in list_of_splits_pids:
train_idx_new = np.arange(len(pids))[np.isin(pids, train_pids)]
test_idx_new = np.arange(len(pids))[np.isin(pids, test_pids)]
list_of_splits.append([
train_idx_new,
test_idx_new,
])
return list_of_splits
Functions
def get_scores(y_true, y_pred)
-
Expand source code
def get_scores(y_true, y_pred): if y_true.ndim > 1: y_true = np.argmax(y_true, axis=1) if y_pred.ndim > 1: y_pred = np.argmax(y_pred, axis=1) try: tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() except ValueError: return None, None, None, None specificity = tn / (tn + fp) sensitivity = tp / (tp + fn) acc = accuracy_score(y_true, y_pred) f1 = f1_score(y_true, y_pred) return sensitivity, specificity, acc, f1
def split_by_ids(X, y, patient_ids, cat=True, valid_only=True, stratify=True, seed=0)
-
Expand source code
def split_by_ids(X, y, patient_ids, cat=True, valid_only=True, stratify=True, seed=0): y[y == 0] = -1 y = y.reshape(-1, ) patient_ids = patient_ids.reshape(-1, ) # make sure the train and test set got both positive and negative patients y_p_id = [] for p_id in np.unique(patient_ids): _y = np.unique(y[patient_ids == p_id]) rng = np.random.default_rng(seed) y_p_id.append(int(_y[0]) if len(_y) < 2 else rng.integers(0,2)) seed += 1 y_p_id = np.array(y_p_id) y_p_id[y_p_id < 0] = 0 train_ids, test_ids = train_test_split(np.unique(patient_ids), test_size=0.33, random_state=seed, stratify=y_p_id if stratify else None) test_y = y[np.isin(patient_ids, test_ids)] if valid_only: test_filter = np.isin(test_y, [-1, 1]) else: test_filter = np.isin(test_y, np.unique(test_y)) if cat: return X[np.isin(patient_ids, train_ids)], y_to_categorical(y[np.isin(patient_ids, train_ids)]), \ X[np.isin(patient_ids, test_ids)][test_filter], y_to_categorical(y[np.isin(patient_ids, test_ids)][ test_filter]) return X[np.isin(patient_ids, train_ids)], y[np.isin(patient_ids, train_ids)], \ X[np.isin(patient_ids, test_ids)], y[np.isin(patient_ids, test_ids)]
Classes
class StratifiedKFoldPids (n_splits=5, shuffle=False, random_state=None)
-
This splits the data so that no train and test split contain the same pid. They will contain roughly the same number of positive and negative samples.
This is based on: It will function in the same way. https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
Arguments
-
n_splits
:int
, optional: The number of splits. Defaults to5
. -
shuffle
:bool
, optional: Whether to shuffle the order of the pids before making the splits. Defaults toFalse
. -
random_state
:_type_
, optional: The random state for the random processes in the class. Defaults toNone
.
Expand source code
class StratifiedKFoldPids: def __init__(self, n_splits=5, shuffle=False, random_state=None): ''' This splits the data so that no train and test split contain the same pid. They will contain roughly the same number of positive and negative samples. This is based on: It will function in the same way. https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html Arguments --------- - ```n_splits```: ```int```, optional: The number of splits. Defaults to ```5```. - ```shuffle```: ```bool```, optional: Whether to shuffle the order of the pids before making the splits. Defaults to ```False```. - ```random_state```: ```_type_```, optional: The random state for the random processes in the class. Defaults to ```None```. ''' self.n_splits = n_splits self.shuffle = shuffle self.random_state = random_state return def get_n_splits(self): ''' Returns the number of splits Returns -------- - ```out```: ```int``` : The number of splits ''' return self.n_splits def split_by_ids(self, y, pids, seed=0): ''' An internal function that given a set of labels and PIDs corresponding to the labels, this function can return the pid values that should be assigned to the training or testing set for each split. Arguments --------- - ```y```: ```array```: Labels. - ```pids```: ```array```: PIDs corresponding to ```y```. - ```seed```: ```int```, optional: The random seed for the random processes. Defaults to ```0```. Returns -------- - ```out```: ```_type_``` : PID values that should be assigned to the training or testing set for each split. ''' labels = np.copy(y) labels[labels == 0] = -1 labels = labels.reshape(-1, ) pids = pids.reshape(-1, ) # make sure the train and test set got both positive and negative patients y_p_id = [] for p_id in np.unique(pids): _y = np.unique(y[pids == p_id]) rng = np.random.default_rng(seed) y_p_id.append(int(_y[0]) if len(_y) < 2 else rng.integers(0,2)) seed += 1 y_p_id = np.array(y_p_id) y_p_id[y_p_id < 0] = 0 splitter = StratifiedKFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=seed if self.shuffle else None) splits = list(splitter.split(np.unique(pids), y=y_p_id)) return [[np.unique(pids)[train_idx], np.unique(pids)[test_idx]] for train_idx, test_idx in splits] def split(self, X, y, pids): ''' This function produces the splits that can be used for training and testing. Arguments --------- - ```X```: ```array```: X input. This isn't used and so anything can be passed here. - ```y```: ```array```: The labels. This is used to stratify the data. - ```pids```: ```_type_```: The PIDs that is used to split the data. Returns -------- - ```out```: ```list``` : List of train-test splits. This list has length equal to ```n_splits```. ''' rng = np.random.default_rng(self.random_state) seed = rng.integers(0,1e6) list_of_splits_pids = self.split_by_ids(y=y, pids=pids, seed=seed) list_of_splits = [] for train_pids, test_pids in list_of_splits_pids: train_idx_new = np.arange(len(pids))[np.isin(pids, train_pids)] test_idx_new = np.arange(len(pids))[np.isin(pids, test_pids)] list_of_splits.append([ train_idx_new, test_idx_new, ]) return list_of_splits
Methods
def get_n_splits(self)
-
Returns the number of splits
Returns
out
:int
: The number of splits
Expand source code
def get_n_splits(self): ''' Returns the number of splits Returns -------- - ```out```: ```int``` : The number of splits ''' return self.n_splits
def split(self, X, y, pids)
-
This function produces the splits that can be used for training and testing.
Arguments
-
X
:array
: X input. This isn't used and so anything can be passed here. -
y
:array
: The labels. This is used to stratify the data. -
pids
:_type_
: The PIDs that is used to split the data.
Returns
out
:list
: List of train-test splits. This list has length equal ton_splits
.
Expand source code
def split(self, X, y, pids): ''' This function produces the splits that can be used for training and testing. Arguments --------- - ```X```: ```array```: X input. This isn't used and so anything can be passed here. - ```y```: ```array```: The labels. This is used to stratify the data. - ```pids```: ```_type_```: The PIDs that is used to split the data. Returns -------- - ```out```: ```list``` : List of train-test splits. This list has length equal to ```n_splits```. ''' rng = np.random.default_rng(self.random_state) seed = rng.integers(0,1e6) list_of_splits_pids = self.split_by_ids(y=y, pids=pids, seed=seed) list_of_splits = [] for train_pids, test_pids in list_of_splits_pids: train_idx_new = np.arange(len(pids))[np.isin(pids, train_pids)] test_idx_new = np.arange(len(pids))[np.isin(pids, test_pids)] list_of_splits.append([ train_idx_new, test_idx_new, ]) return list_of_splits
-
def split_by_ids(self, y, pids, seed=0)
-
An internal function that given a set of labels and PIDs corresponding to the labels, this function can return the pid values that should be assigned to the training or testing set for each split.
Arguments
-
y
:array
: Labels. -
pids
:array
: PIDs corresponding toy
. -
seed
:int
, optional: The random seed for the random processes. Defaults to0
.
Returns
out
:_type_
: PID values that should be assigned to the training or testing set for each split.
Expand source code
def split_by_ids(self, y, pids, seed=0): ''' An internal function that given a set of labels and PIDs corresponding to the labels, this function can return the pid values that should be assigned to the training or testing set for each split. Arguments --------- - ```y```: ```array```: Labels. - ```pids```: ```array```: PIDs corresponding to ```y```. - ```seed```: ```int```, optional: The random seed for the random processes. Defaults to ```0```. Returns -------- - ```out```: ```_type_``` : PID values that should be assigned to the training or testing set for each split. ''' labels = np.copy(y) labels[labels == 0] = -1 labels = labels.reshape(-1, ) pids = pids.reshape(-1, ) # make sure the train and test set got both positive and negative patients y_p_id = [] for p_id in np.unique(pids): _y = np.unique(y[pids == p_id]) rng = np.random.default_rng(seed) y_p_id.append(int(_y[0]) if len(_y) < 2 else rng.integers(0,2)) seed += 1 y_p_id = np.array(y_p_id) y_p_id[y_p_id < 0] = 0 splitter = StratifiedKFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=seed if self.shuffle else None) splits = list(splitter.split(np.unique(pids), y=y_p_id)) return [[np.unique(pids)[train_idx], np.unique(pids)[test_idx]] for train_idx, test_idx in splits]
-
-