Module minder_utils.scripts.feature_selection

Source code
from minder_utils.models.feature_selectors.supervised.filter import Supervised_Filter
from minder_utils.models.feature_selectors.unsupervised.filter import Unsupervised_Filter
from minder_utils.evaluate.eval_utils import split_by_ids, get_scores
from minder_utils.formatting.format_util import format_mean_std
import pandas as pd


class Feature_Selection:
    '''
    This class helps you choose the best feature selection method for your model.

    To run the script, your model must implement a ```reset()``` method, which
    resets the model's parameters between evaluation runs.

    ```Example```
    ```
    from minder_utils.scripts.feature_selection import Feature_Selection
    from minder_utils.models.classifiers import Classifiers

    classifier = Classifiers('nn')
    feature_select = Feature_Selection(classifier)

    feature_select.evaluate(X, y, p_ids, num_runs=10)
    ```
    '''
    def __init__(self, model, proportion=90):
        self.model = model
        self.feature_selector = [Supervised_Filter(proportion=proportion), Unsupervised_Filter()]

    def evaluate(self, X, y, p_ids, num_runs=10):
        header = ['Supervision', 'Method', 'Features selected ({} in total)'.format(X.shape[1]), 'sensitivity', 'specificity', 'acc', 'f1']
        results = []
        for selector in self.feature_selector:
            info = selector.get_info()
            for model_type in info:
                # Accumulate scores separately for each feature selection method,
                # so that every result row reports one method only.
                sen, spe, accs, f1s, num_feats = [], [], [], [], []
                for run in range(num_runs):
                    # Split at the participant level so no participant appears
                    # in both the training and the test set.
                    X_train, y_train, X_test, y_test = split_by_ids(X, y, p_ids, seed=run)
                    self.model.reset()
                    selector.reset_model(model_type)
                    selector.fit(X_train, y_train)
                    X_train_trans = selector.transform(X_train)
                    X_test_trans = selector.transform(X_test)
                    self.model.fit(X_train_trans, y_train)
                    sensitivity, specificity, acc, f1 = get_scores(y_test, self.model.predict(X_test_trans))
                    # Skip runs with undefined scores (e.g. a single-class test split).
                    if sensitivity is not None and str(sensitivity) != 'nan':
                        sen.append(sensitivity)
                        spe.append(specificity)
                        accs.append(acc)
                        f1s.append(f1)
                        num_feats.append(X_train_trans.shape[1])
                # One row per method: each metric summarised over the runs.
                row = [selector.__name__()[0], selector.__name__()[1], format_mean_std(num_feats),
                       format_mean_std(sen), format_mean_std(spe), format_mean_std(accs), format_mean_std(f1s)]
                results.append(row)
        df_results = pd.DataFrame(results, columns=header)
        return df_results
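
Note that split_by_ids is assumed to partition the data at the participant level, so that rows from one participant never appear in both the training and the test set. A minimal sketch of such a split, illustrated here with scikit-learn's GroupShuffleSplit (an illustration of the assumed behaviour, not the library's actual implementation; the test_size value is hypothetical):

from sklearn.model_selection import GroupShuffleSplit

def split_by_ids_sketch(X, y, p_ids, seed=0, test_size=0.3):
    # Group-aware split: each participant id lands entirely in either
    # the training set or the test set, never both.
    splitter = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    train_idx, test_idx = next(splitter.split(X, y, groups=p_ids))
    return X[train_idx], y[train_idx], X[test_idx], y[test_idx]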

Classes

class Feature_Selection (model, proportion=90)

This class helps you choose the best feature selection method for your model.

To run the script, your model must implement a reset() method, which resets the model's parameters between evaluation runs.
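
For reference, a minimal sketch of a conforming model (the wrapper and its estimator are hypothetical; any classifier exposing fit, predict and reset works):

from sklearn.linear_model import LogisticRegression

class ResettableClassifier:
    '''Hypothetical wrapper satisfying the fit/predict/reset interface.'''

    def __init__(self):
        self.reset()

    def reset(self):
        # Rebuild the estimator so previously learned parameters are discarded.
        self.clf = LogisticRegression(max_iter=1000)

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)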

Example

from minder_utils.scripts.feature_selection import Feature_Selection
from minder_utils.models.classifiers import Classifiers

classifier = Classifiers('nn')
feature_select = Feature_Selection(classifier)

feature_select.evaluate(X, y, p_ids, num_runs=10)
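
evaluate returns a pandas DataFrame with one row per feature selection method, each metric formatted by format_mean_std (presumably as mean ± standard deviation over the runs). A typical follow-up (file name hypothetical):

results = feature_select.evaluate(X, y, p_ids, num_runs=10)
print(results.to_string(index=False))
results.to_csv('feature_selection_results.csv', index=False)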

Methods

def evaluate(self, X, y, p_ids, num_runs=10)
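
Runs every feature selection method num_runs times, fitting the selector and the model on a fresh participant-level split each run, and returns a pandas DataFrame with one row per method. The four scores per run come from get_scores; a plausible sketch of how such metrics are derived from a binary confusion matrix (an assumption about get_scores, shown for orientation only):

from sklearn.metrics import confusion_matrix, f1_score

def get_scores_sketch(y_test, y_pred):
    # Binary confusion matrix flattened as tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')  # true positive rate
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')  # true negative rate
    acc = (tp + tn) / (tp + tn + fp + fn)
    f1 = f1_score(y_test, y_pred)
    return sensitivity, specificity, acc, f1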