Title: | User-Friendly R Package for Supervised Machine Learning Pipelines |
---|---|
Description: | An interface to build machine learning models for classification and regression problems. 'mikropml' implements the ML pipeline described by Topçuoğlu et al. (2020) <doi:10.1128/mBio.00434-20> with reasonable default options for data preprocessing, hyperparameter tuning, cross-validation, testing, model evaluation, and interpretation steps. See the website <https://www.schlosslab.org/mikropml/> for more information, documentation, and examples. |
Authors: | Begüm Topçuoğlu [aut] |
Maintainer: | Kelly Sovacool <[email protected]> |
License: | MIT + file LICENSE |
Version: | 1.6.1.9000 |
Built: | 2025-02-13 04:44:36 UTC |
Source: | https://github.com/schlosslab/mikropml |
Uses rsample::bootstraps(), rsample::int_pctl(), and furrr::future_map().
bootstrap_performance(ml_result, outcome_colname, bootstrap_times = 10000, alpha = 0.05)
ml_result | result returned from a single run_ml() call
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
bootstrap_times | the number of bootstraps to create (default: 10000)
alpha | the alpha level for the confidence interval (default: 0.05)
a data frame with an estimate (.estimate), lower bound (.lower), and upper bound (.upper) for each performance metric (term).
Kelly Sovacool, [email protected]
bootstrap_performance(otu_mini_bin_results_glmnet, "dx",
  bootstrap_times = 10, alpha = 0.10
)
## Not run:
outcome_colname <- "dx"
run_ml(otu_mini_bin, "rf", outcome_colname = "dx") %>%
  bootstrap_performance(outcome_colname,
    bootstrap_times = 10000, alpha = 0.05
  )
## End(Not run)
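Because bootstrap_performance() iterates over the bootstrap resamples with furrr::future_map(), registering a future plan beforehand should parallelize the resampling; without one, the default sequential plan is used. A minimal sketch (the worker count and bootstrap_times here are arbitrary choices, not package recommendations):

## Not run:
# register a parallel backend before bootstrapping, then revert to sequential
future::plan(future::multisession, workers = 2)
boot_ci <- bootstrap_performance(otu_mini_bin_results_glmnet, "dx",
  bootstrap_times = 100, alpha = 0.05
)
future::plan(future::sequential)
## End(Not run)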
Implements Equation 1 from Wu et al. 2021 doi:10.1016/j.ajhg.2021.08.012. It is the same as Equation 7 if AUPRC (aka prAUC) is used in place of precision.
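As a rough sketch of what that rebalancing does (this is my reading of Equation 1, derived from the definitions of precision and the prior; consult Wu et al. 2021 for the authoritative form), precision is recomputed as if positives and negatives each made up half of the data:

# hypothetical helper illustrating the rebalancing idea; not the package internals
balanced_precision_sketch <- function(precision, prior) {
  precision * (1 - prior) / (precision * (1 - prior) + (1 - precision) * prior)
}
balanced_precision_sketch(precision = 0.9, prior = 0.1) # higher than 0.9 because positives are rare
balanced_precision_sketch(precision = 0.9, prior = 0.5) # 0.9; unchanged when the data are already balanced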
calc_balanced_precision(precision, prior)
precision | actual precision of the model.
prior | baseline precision, aka frequency of positives. Can be calculated with calc_baseline_precision().
the expected precision if the data were balanced
Kelly Sovacool [email protected]
prior <- calc_baseline_precision(otu_mini_bin,
  outcome_colname = "dx", pos_outcome = "cancer"
)
calc_balanced_precision(otu_mini_bin_results_rf$performance$Precision, prior)

otu_mini_bin_results_rf$performance %>%
  dplyr::mutate(
    balanced_precision = calc_balanced_precision(Precision, prior),
    aubprc = calc_balanced_precision(prAUC, prior)
  ) %>%
  dplyr::select(AUC, Precision, balanced_precision, aubprc)

# cumulative performance for a single model
sensspec_1 <- calc_model_sensspec(
  otu_mini_bin_results_glmnet$trained_model,
  otu_mini_bin_results_glmnet$test_data,
  "dx"
)
head(sensspec_1)
prior <- calc_baseline_precision(otu_mini_bin,
  outcome_colname = "dx", pos_outcome = "cancer"
)
sensspec_1 %>%
  dplyr::mutate(balanced_precision = calc_balanced_precision(precision, prior)) %>%
  dplyr::rename(recall = sensitivity) %>%
  calc_mean_perf(group_var = recall, sum_var = balanced_precision) %>%
  plot_mean_prc(ycol = mean_balanced_precision)
Calculate the fraction of positives, i.e. baseline precision for a PRC curve
calc_baseline_precision(dataset, outcome_colname = NULL, pos_outcome = NULL)
dataset | Data frame with an outcome variable and other columns as features.
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
pos_outcome | the positive outcome from outcome_colname, e.g. "cancer" for the otu_mini_bin dataset.
the baseline precision based on the fraction of positives
Kelly Sovacool, [email protected]
# calculate the baseline precision
data.frame(y = c("a", "b", "a", "b")) %>%
  calc_baseline_precision(
    outcome_colname = "y",
    pos_outcome = "a"
  )

calc_baseline_precision(otu_mini_bin,
  outcome_colname = "dx",
  pos_outcome = "cancer"
)

# if you're not sure which outcome was used as the 'positive' outcome during
# model training, you can access it from the trained model and pass it along:
calc_baseline_precision(otu_mini_bin,
  outcome_colname = "dx",
  pos_outcome = otu_mini_bin_results_glmnet$trained_model$levels[1]
)
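In other words, the baseline precision is simply the proportion of observations with the positive outcome: for the toy data frame above, that is 2 of 4 rows, or 0.5. A base-R sketch of the same arithmetic:

y <- c("a", "b", "a", "b")
sum(y == "a") / length(y) # 0.5, the fraction of positives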
Used by calc_mean_roc() and calc_mean_prc().
calc_mean_perf(sensspec_dat, group_var = specificity, sum_var = sensitivity)
sensspec_dat | data frame created by concatenating results of calc_model_sensspec() for multiple models.
group_var | variable to group by (e.g. specificity or recall).
sum_var | variable to summarize (e.g. sensitivity or precision).
data frame with mean & standard deviation of sum_var summarized over group_var.
Courtney Armour
Kelly Sovacool
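A minimal sketch of calling calc_mean_perf() directly rather than through calc_mean_roc() or calc_mean_prc(), assuming sensspec_dat was built by concatenating calc_model_sensspec() results as in the examples further below:

## Not run:
# mean sensitivity at each specificity across models (mirroring calc_mean_roc())
mean_roc_dat <- calc_mean_perf(sensspec_dat,
  group_var = specificity, sum_var = sensitivity
)
# mean precision at each recall across models (mirroring calc_mean_prc())
mean_prc_dat <- sensspec_dat %>%
  dplyr::rename(recall = sensitivity) %>%
  calc_mean_perf(group_var = recall, sum_var = precision)
## End(Not run)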
Use these functions to calculate cumulative sensitivity, specificity, recall, etc. on single models, concatenate the results together from multiple models, and compute mean ROC and PRC. You can then plot mean ROC and PRC curves to visualize the results. Note: These functions assume a binary outcome.
calc_model_sensspec(trained_model, test_data, outcome_colname = NULL)
calc_mean_roc(sensspec_dat)
calc_mean_prc(sensspec_dat)
trained_model | Trained model from run_ml().
test_data | Held out test data: dataframe of outcome and features.
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
sensspec_dat | data frame created by concatenating results of calc_model_sensspec() for multiple models.
data frame with summarized performance
calc_model_sensspec(): Get sensitivity, specificity, and precision for a model.
calc_mean_roc(): Calculate mean sensitivity over specificity for multiple models.
calc_mean_prc(): Calculate mean precision over recall for multiple models.
Courtney Armour
Kelly Sovacool, [email protected]
## Not run:
library(dplyr)

# get cumulative performance for a single model
sensspec_1 <- calc_model_sensspec(
  otu_mini_bin_results_glmnet$trained_model,
  otu_mini_bin_results_glmnet$test_data,
  "dx"
)
head(sensspec_1)

# get performance for multiple models
get_sensspec_seed <- function(seed) {
  ml_result <- run_ml(otu_mini_bin, "glmnet", seed = seed)
  sensspec <- calc_model_sensspec(
    ml_result$trained_model,
    ml_result$test_data,
    "dx"
  ) %>%
    dplyr::mutate(seed = seed)
  return(sensspec)
}
sensspec_dat <- purrr::map_dfr(seq(100, 102), get_sensspec_seed)

# calculate mean sensitivity over specificity
roc_dat <- calc_mean_roc(sensspec_dat)
head(roc_dat)

# calculate mean precision over recall
prc_dat <- calc_mean_prc(sensspec_dat)
head(prc_dat)

# plot ROC & PRC
roc_dat %>% plot_mean_roc()
baseline_prec <- calc_baseline_precision(otu_mini_bin, "dx", "cancer")
prc_dat %>% plot_mean_prc(baseline_precision = baseline_prec)

# balanced precision
prior <- calc_baseline_precision(otu_mini_bin,
  outcome_colname = "dx", pos_outcome = "cancer"
)
bprc_dat <- sensspec_dat %>%
  dplyr::mutate(balanced_precision = calc_balanced_precision(precision, prior)) %>%
  dplyr::rename(recall = sensitivity) %>%
  calc_mean_perf(group_var = recall, sum_var = balanced_precision)
bprc_dat %>%
  plot_mean_prc(ycol = mean_balanced_precision) +
  ylab("Mean Bal. Precision")
## End(Not run)
Get performance metrics for test data
calc_perf_metrics(test_data, trained_model, outcome_colname, perf_metric_function, class_probs)
test_data | Held out test data: dataframe of outcome and features.
trained_model | Trained model from run_ml().
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
perf_metric_function | Function to calculate the performance metric to be used for cross-validation and test performance. Some functions are provided by caret (see caret::defaultSummary()).
class_probs | Whether to use class probabilities (TRUE for categorical outcomes, FALSE for numeric outcomes).
Dataframe of performance metrics.
Zena Lapp, [email protected]
## Not run:
results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
calc_perf_metrics(results$test_data,
  results$trained_model,
  "dx",
  multiClassSummary,
  class_probs = TRUE
)
## End(Not run)
Combine hyperparameter performance metrics for multiple train/test splits generated by, for instance, looping in R or using a snakemake workflow on a high-performance computer.
combine_hp_performance(trained_model_lst)
trained_model_lst |
List of trained models. |
Named list:
dat: Dataframe of performance metric for each group of hyperparameters.
params: Hyperparameters tuned.
metric: Performance metric used.
Zena Lapp, [email protected]
## Not run:
results <- lapply(seq(100, 102), function(seed) {
  run_ml(otu_small, "glmnet", seed = seed, cv_times = 2, kfold = 2)
})
models <- lapply(results, function(x) x$trained_model)
combine_hp_performance(models)
## End(Not run)
A wrapper for permute_p_value().
compare_models(merged_data, metric, group_name, nperm = 10000)
merged_data | the concatenated performance data from run_ml()
metric | metric to compare; must be numeric
group_name | column with group variables to compare
nperm | number of permutations (default: 10000)
a table of p-values for all pairs of the group variable
Courtney R Armour, [email protected]
df <- dplyr::tibble(
  model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"),
  AUC = c(0.2, 0.3, 0.8, 0.9, 0.85, 0.95)
)
set.seed(123)
compare_models(df, "AUC", "model", nperm = 10)
Define cross-validation scheme and training parameters
define_cv(
  train_data,
  outcome_colname,
  hyperparams_list,
  perf_metric_function,
  class_probs,
  kfold = 5,
  cv_times = 100,
  groups = NULL,
  group_partitions = NULL
)
train_data | Dataframe for training model.
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
hyperparams_list | Named list of lists of hyperparameters.
perf_metric_function | Function to calculate the performance metric to be used for cross-validation and test performance. Some functions are provided by caret (see caret::defaultSummary()).
class_probs | Whether to use class probabilities (TRUE for categorical outcomes, FALSE for numeric outcomes).
kfold | Fold number for k-fold cross-validation (default: 5)
cv_times | Number of cross-validation partitions to create (default: 100)
groups | Vector of groups to keep together when splitting the data into train and test sets. If the number of groups in the training set is larger than kfold, the groups will also be kept together for cross-validation (default: NULL)
group_partitions | Specify how to assign groups to the training and testing partitions (default: NULL)
Caret object for trainControl that controls cross-validation
Begüm Topçuoğlu, [email protected]
Kelly Sovacool, [email protected]
training_inds <- get_partition_indices(otu_small %>% dplyr::pull("dx"),
  training_frac = 0.8,
  groups = NULL
)
train_data <- otu_small[training_inds, ]
test_data <- otu_small[-training_inds, ]
cv <- define_cv(train_data,
  outcome_colname = "dx",
  hyperparams_list = get_hyperparams_list(otu_small, "glmnet"),
  perf_metric_function = caret::multiClassSummary,
  class_probs = TRUE,
  kfold = 5
)
Get preprocessed dataframe for continuous variables
get_caret_processed_df(features, method)
features | Dataframe of features for machine learning
method | Methods to preprocess the data, described in caret::preProcess().
Named list:
processed: Dataframe of processed features.
removed: Names of any features removed during preprocessing.
Zena Lapp, [email protected]
get_caret_processed_df(mikropml::otu_small[, 2:ncol(otu_small)], c("center", "scale"))
Calculates feature importance using a trained model and test data. Requires the future.apply package.
get_feature_importance(
  trained_model,
  test_data,
  outcome_colname,
  perf_metric_function,
  perf_metric_name,
  class_probs,
  method,
  seed = NA,
  corr_thresh = 1,
  groups = NULL,
  nperms = 100,
  corr_method = "spearman"
)
trained_model | Trained model from run_ml().
test_data | Held out test data: dataframe of outcome and features.
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
perf_metric_function | Function to calculate the performance metric to be used for cross-validation and test performance. Some functions are provided by caret (see caret::defaultSummary()).
perf_metric_name | The column name from the output of the function provided to perf_metric_function that is to be used as the performance metric. Defaults: binary classification = "AUC", multi-class classification = "logLoss", regression = "RMSE".
class_probs | Whether to use class probabilities (TRUE for categorical outcomes, FALSE for numeric outcomes).
method | ML method. Options: c("glmnet", "rf", "rpart2", "svmRadial", "xgbTree").
seed | Random seed (default: NA). Your results will only be reproducible if you set a seed.
corr_thresh | For feature importance, group correlations above or equal to corr_thresh (range 0 to 1; default: 1).
groups | Vector of feature names to group together during permutation. Each element should be a string with feature names separated by a pipe character (|). If NULL, correlated features will be grouped together based on corr_thresh.
nperms | number of permutations to perform (default: 100).
corr_method | correlation method. Options are the same as those supported by stats::cor: spearman, pearson, kendall (default: "spearman").
For permutation tests, the p-value is the number of permutation statistics that are greater than the test statistic, divided by the number of permutations. In our case, the permutation statistic is the model performance (e.g. AUROC) after randomizing the order of observations for one feature, and the test statistic is the actual performance on the test data. By default we perform 100 permutations per feature; increasing this will increase the precision of estimating the null distribution, but also increases runtime. The p-value represents the probability of obtaining the actual performance in the event that the null hypothesis is true, where the null hypothesis is that the feature is not important for model performance.
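The arithmetic behind that p-value is straightforward once the permuted performances are in hand. A generic sketch of the calculation described above (illustrative only, not the package internals):

# proportion of permuted performances greater than the actual test performance
perm_pvalue_sketch <- function(actual_perf, permuted_perfs) {
  sum(permuted_perfs > actual_perf) / length(permuted_perfs)
}
perm_pvalue_sketch(actual_perf = 0.80, permuted_perfs = c(0.62, 0.65, 0.79, 0.81)) # 0.25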
We strongly recommend providing multiple cores to speed up computation time. See our vignette on parallel processing for more details.
Data frame with performance metrics for when each feature (or group of correlated features; feat) is permuted (perf_metric), the difference between the actual test performance metric and the permuted performance metric (perf_metric_diff; test minus permuted performance), and the p-value (pvalue: the probability of obtaining the actual performance value under the null hypothesis). Features with a larger perf_metric_diff are more important. The performance metric name (perf_metric_name) and seed (seed) are also returned.
Begüm Topçuoğlu, [email protected]
Zena Lapp, [email protected]
Kelly Sovacool, [email protected]
## Not run:
# If you called `run_ml()` with `feature_importance = FALSE` (the default),
# you can use `get_feature_importance()` later as long as you have the
# trained model and test data.
results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
names(results$trained_model$trainingData)[1] <- "dx"
feat_imp <- get_feature_importance(results$trained_model,
  results$trained_model$trainingData, results$test_data,
  "dx",
  multiClassSummary, "AUC",
  class_probs = TRUE, method = "glmnet"
)

# We strongly recommend providing multiple cores to speed up computation time.
# Do this before calling `get_feature_importance()`.
doFuture::registerDoFuture()
future::plan(future::multicore, workers = 2)

# Optionally, you can group features together with a custom grouping
feat_imp <- get_feature_importance(results$trained_model,
  results$trained_model$trainingData, results$test_data,
  "dx",
  multiClassSummary, "AUC",
  class_probs = TRUE, method = "glmnet",
  groups = c(
    "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012",
    "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020",
    "Otu00022", "Otu00023", "Otu00025", "Otu00028", "Otu00029",
    "Otu00030", "Otu00035", "Otu00036", "Otu00037", "Otu00038",
    "Otu00039", "Otu00040", "Otu00047", "Otu00050", "Otu00052",
    "Otu00054", "Otu00055", "Otu00056", "Otu00060",
    "Otu00003|Otu00002|Otu00005|Otu00024|Otu00032|Otu00041|Otu00053",
    "Otu00014|Otu00021|Otu00017|Otu00031|Otu00057",
    "Otu00013|Otu00006",
    "Otu00026|Otu00001|Otu00034|Otu00048",
    "Otu00033|Otu00010",
    "Otu00042|Otu00004",
    "Otu00043|Otu00027|Otu00049",
    "Otu00051|Otu00045",
    "Otu00058|Otu00044",
    "Otu00059|Otu00046"
  )
)

# the function can show a progress bar if you have the `progressr` package installed.
## optionally, specify the progress bar format:
progressr::handlers(progressr::handler_progress(
  format = ":message :bar :percent | elapsed: :elapsed | eta: :eta",
  clear = FALSE,
  show_after = 0
))
## tell progressr to always report progress
progressr::handlers(global = TRUE)
## run the function and watch the live progress updates
feat_imp <- get_feature_importance(results$trained_model,
  results$trained_model$trainingData, results$test_data,
  "dx",
  multiClassSummary, "AUC",
  class_probs = TRUE, method = "glmnet"
)

# You can specify any correlation method supported by `stats::cor`:
feat_imp <- get_feature_importance(results$trained_model,
  results$trained_model$trainingData, results$test_data,
  "dx",
  multiClassSummary, "AUC",
  class_probs = TRUE, method = "glmnet",
  corr_method = "pearson"
)
## End(Not run)
Get hyperparameter performance metrics
get_hp_performance(trained_model)
trained_model | trained model (e.g. from run_ml())
Named list:
dat: Dataframe of performance metric for each group of hyperparameters.
params: Hyperparameters tuned.
metric: Performance metric used.
Zena Lapp, [email protected]
Kelly Sovacool [email protected]
get_hp_performance(otu_mini_bin_results_glmnet$trained_model)
For more details see the vignette on hyperparameter tuning.
get_hyperparams_list(dataset, method)
dataset | Data frame with an outcome variable and other columns as features.
method | ML method. Options: c("glmnet", "rf", "rpart2", "svmRadial", "xgbTree").
Named list of hyperparameters.
Kelly Sovacool, [email protected]
get_hyperparams_list(otu_mini_bin, "rf")
get_hyperparams_list(otu_small, "rf")
get_hyperparams_list(otu_mini_bin, "rpart2")
get_hyperparams_list(otu_small, "rpart2")
If the outcome is numeric, the type is continuous. Otherwise, the outcome type is binary if there are only two outcomes or multiclass if there are more than two outcomes.
get_outcome_type(outcomes_vec)
outcomes_vec |
Vector of outcomes. |
Outcome type (continuous, binary, or multiclass).
Zena Lapp, [email protected]
get_outcome_type(c(1, 2, 1))
get_outcome_type(c("a", "b", "b"))
get_outcome_type(c("a", "b", "c"))
Use this function to get the row indices for the training set.
get_partition_indices(outcomes, training_frac = 0.8, groups = NULL, group_partitions = NULL)
outcomes | vector of outcomes
training_frac | Fraction of data for training set (default: 0.8)
groups | Vector of groups to keep together when splitting the data into train and test sets. If the number of groups in the training set is larger than kfold, the groups will also be kept together for cross-validation (default: NULL)
group_partitions | Specify how to assign groups to the training and testing partitions (default: NULL)
If groups is NULL, uses caret::createDataPartition(). Otherwise, uses create_grouped_data_partition().
Set the seed prior to calling this function if you would like your data partitions to be reproducible (recommended).
Vector of row indices for the training set.
Kelly Sovacool, [email protected]
training_inds <- get_partition_indices(otu_mini_bin$dx)
train_data <- otu_mini_bin[training_inds, ]
test_data <- otu_mini_bin[-training_inds, ]
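Following the reproducibility note above, a minimal sketch of seeding the partition so the same rows land in the training set on every run:

set.seed(2019)
training_inds <- get_partition_indices(otu_mini_bin$dx, training_frac = 0.8)
train_data <- otu_mini_bin[training_inds, ]
test_data <- otu_mini_bin[-training_inds, ]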
Get default performance metric function
get_perf_metric_fn(outcome_type)
outcome_type | Type of outcome (one of: "continuous", "binary", "multiclass").
Performance metric function.
Zena Lapp, [email protected]
get_perf_metric_fn("continuous") get_perf_metric_fn("binary") get_perf_metric_fn("multiclass")
get_perf_metric_fn("continuous") get_perf_metric_fn("binary") get_perf_metric_fn("multiclass")
Get default performance metric name for cross-validation.
get_perf_metric_name(outcome_type)
outcome_type | Type of outcome (one of: "continuous", "binary", "multiclass").
Performance metric name.
Zena Lapp, [email protected]
get_perf_metric_name("continuous") get_perf_metric_name("binary") get_perf_metric_name("multiclass")
get_perf_metric_name("continuous") get_perf_metric_name("binary") get_perf_metric_name("multiclass")
Get model performance metrics as a one-row tibble
get_performance_tbl(
  trained_model,
  test_data,
  outcome_colname,
  perf_metric_function,
  perf_metric_name,
  class_probs,
  method,
  seed = NA
)
trained_model | Trained model from run_ml().
test_data | Held out test data: dataframe of outcome and features.
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
perf_metric_function | Function to calculate the performance metric to be used for cross-validation and test performance. Some functions are provided by caret (see caret::defaultSummary()).
perf_metric_name | The column name from the output of the function provided to perf_metric_function that is to be used as the performance metric. Defaults: binary classification = "AUC", multi-class classification = "logLoss", regression = "RMSE".
class_probs | Whether to use class probabilities (TRUE for categorical outcomes, FALSE for numeric outcomes).
method | ML method. Options: c("glmnet", "rf", "rpart2", "svmRadial", "xgbTree").
seed | Random seed (default: NA). Your results will only be reproducible if you set a seed.
A one-row tibble with a column for the cross-validation performance, columns for each of the performance metrics for the test data, plus the method and seed.
Kelly Sovacool, [email protected]
Zena Lapp, [email protected]
## Not run:
results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2)
names(results$trained_model$trainingData)[1] <- "dx"
get_performance_tbl(results$trained_model,
  results$test_data,
  "dx",
  multiClassSummary,
  "AUC",
  class_probs = TRUE,
  method = "glmnet"
)
## End(Not run)
Generate the tuning grid for tuning hyperparameters
get_tuning_grid(hyperparams_list, method)
hyperparams_list | Named list of lists of hyperparameters.
method | ML method. Options: c("glmnet", "rf", "rpart2", "svmRadial", "xgbTree").
The tuning grid.
Begüm Topçuoğlu, [email protected]
Kelly Sovacool, [email protected]
ml_method <- "glmnet" hparams_list <- get_hyperparams_list(otu_small, ml_method) get_tuning_grid(hparams_list, ml_method)
ml_method <- "glmnet" hparams_list <- get_hyperparams_list(otu_small, ml_method) get_tuning_grid(hparams_list, ml_method)
This is the result of running preprocess_data("otu_mini_bin")
otu_data_preproc
An object of class list of length 3.
A dataset containing relative abundances of OTUs for human stool samples with a binary outcome, dx. This is a subset of otu_small.
otu_mini_bin
A data frame
The dx column is the diagnosis: healthy or cancerous (colorectal). All other columns are OTU relative abundances.
Results from running the pipeline with L2 logistic regression on otu_mini_bin with feature importance and grouping
otu_mini_bin_results_glmnet
An object of class list of length 4.
Results from running the pipeline with random forest on otu_mini_bin
otu_mini_bin_results_rf
An object of class list of length 4.
Results from running the pipeline with rpart2 on otu_mini_bin
otu_mini_bin_results_rpart2
An object of class list of length 4.
Results from running the pipeline with svmRadial on otu_mini_bin
otu_mini_bin_results_svmRadial
An object of class list of length 4.
Results from running the pipeline with xgbTree on otu_mini_bin
otu_mini_bin_results_xgbTree
An object of class list of length 4.
Results from running the pipeline with glmnet on otu_mini_bin with Otu00001 as the outcome
otu_mini_cont_results_glmnet
An object of class list of length 4.
Results from running the pipeline with glmnet on otu_mini_bin with Otu00001 as the outcome column, using a custom train control scheme that does not perform cross-validation
otu_mini_cont_results_nocv
An object of class list of length 4.
Cross validation on train_data_mini with grouped features.
otu_mini_cv
An object of class list of length 27.
A dataset containing relative abundances of OTUs for human stool samples
otu_mini_multi
A data frame
The dx column is the colorectal cancer diagnosis: adenoma, carcinoma, normal. All other columns are OTU relative abundances.
Groups for otu_mini_multi
otu_mini_multi_group
An object of class character of length 490.
Results from running the pipeline with glmnet on otu_mini_multi for multiclass outcomes
otu_mini_multi_results_glmnet
An object of class list of length 4.
A dataset containing relative abundances of 60 OTUs for 60 human stool samples. This is a subset of the data provided in extdata/otu_large.csv, which was used in Topçuoğlu et al. 2020.
otu_small
A data frame with 60 rows and 61 variables.
The dx column is the diagnosis: healthy or cancerous (colorectal). All other columns are OTU relative abundances.
Calculate a permuted p-value comparing two models
permute_p_value(merged_data, metric, group_name, group_1, group_2, nperm = 10000)
merged_data | the concatenated performance data from run_ml()
metric | metric to compare; must be numeric
group_name | column with group variables to compare
group_1 | name of one group to compare
group_2 | name of other group to compare
nperm | number of permutations (default: 10000)
numeric p-value comparing two models
Begüm Topçuoğlu, [email protected]
Courtney R Armour, [email protected]
df <- dplyr::tibble(
  model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"),
  AUC = c(0.2, 0.3, 0.8, 0.9, 0.85, 0.95)
)
set.seed(123)
permute_p_value(df, "AUC", "model", "rf", "glmnet", nperm = 100)
Plot hyperparameter performance metrics
plot_hp_performance(dat, param_col, metric_col)
dat | dataframe of hyperparameters and performance metric (e.g. from get_hp_performance() or combine_hp_performance())
param_col | hyperparameter to be plotted. Must be a column in dat.
metric_col | performance metric. Must be a column in dat.
ggplot of hyperparameter performance.
Zena Lapp, [email protected]
Kelly Sovacool [email protected]
# plot for a single `run_ml()` call
hp_metrics <- get_hp_performance(otu_mini_bin_results_glmnet$trained_model)
hp_metrics
plot_hp_performance(hp_metrics$dat, lambda, AUC)

## Not run:
# plot for multiple `run_ml()` calls
results <- lapply(seq(100, 102), function(seed) {
  run_ml(otu_small, "glmnet", seed = seed)
})
models <- lapply(results, function(x) x$trained_model)
hp_metrics <- combine_hp_performance(models)
plot_hp_performance(hp_metrics$dat, lambda, AUC)
## End(Not run)
Plot ROC and PRC curves
plot_mean_roc(dat, ribbon_fill = "#C6DBEF", line_color = "#08306B")

plot_mean_prc(
  dat,
  baseline_precision = NULL,
  ycol = mean_precision,
  ribbon_fill = "#C7E9C0",
  line_color = "#00441B"
)
dat | sensitivity, specificity, and precision data calculated by calc_mean_roc() or calc_mean_prc()
ribbon_fill | ribbon fill color (default: "#C6DBEF" for plot_mean_roc(), "#C7E9C0" for plot_mean_prc())
line_color | line color (default: "#08306B" for plot_mean_roc(), "#00441B" for plot_mean_prc())
baseline_precision | baseline precision from calc_baseline_precision()
ycol | column for the y axis (default: mean_precision)
plot_mean_roc(): Plot mean sensitivity over specificity.
plot_mean_prc(): Plot mean precision over recall.
Courtney Armour
Kelly Sovacool [email protected]
## Not run:
library(dplyr)

# get performance for multiple models
get_sensspec_seed <- function(seed) {
  ml_result <- run_ml(otu_mini_bin, "glmnet", seed = seed)
  sensspec <- calc_model_sensspec(
    ml_result$trained_model,
    ml_result$test_data,
    "dx"
  ) %>%
    mutate(seed = seed)
  return(sensspec)
}
sensspec_dat <- purrr::map_dfr(seq(100, 102), get_sensspec_seed)

# plot ROC & PRC
sensspec_dat %>%
  calc_mean_roc() %>%
  plot_mean_roc()
baseline_prec <- calc_baseline_precision(otu_mini_bin, "dx", "cancer")
sensspec_dat %>%
  calc_mean_prc() %>%
  plot_mean_prc(baseline_precision = baseline_prec)
## End(Not run)
ggplot2 is required to use this function.
plot_model_performance(performance_df)
performance_df | dataframe of performance results from multiple calls to run_ml()
A ggplot2 plot of performance.
Begüm Topçuoğlu, [email protected]
Kelly Sovacool, [email protected]
## Not run:
# call `run_ml()` multiple times with different seeds
results_lst <- lapply(seq(100, 104), function(seed) {
  run_ml(otu_small, "glmnet", seed = seed)
})

# extract and combine the performance results
perf_df <- lapply(results_lst, function(result) {
  result[["performance"]]
}) %>%
  dplyr::bind_rows()

# plot the performance results
p <- plot_model_performance(perf_df)

# call `run_ml()` with different ML methods
param_grid <- expand.grid(
  seeds = seq(100, 104),
  methods = c("glmnet", "rf")
)
results_mtx <- mapply(
  function(seed, method) {
    run_ml(otu_mini_bin, method, seed = seed, kfold = 2)
  },
  param_grid$seeds, param_grid$methods
)

# extract and combine the performance results
perf_df2 <- dplyr::bind_rows(results_mtx["performance", ])

# plot the performance results
p <- plot_model_performance(perf_df2)

# you can continue adding layers to customize the plot
p +
  theme_classic() +
  scale_color_brewer(palette = "Dark2") +
  coord_flip()
## End(Not run)
Function to preprocess your data for input into run_ml().
preprocess_data(
  dataset,
  outcome_colname,
  method = c("center", "scale"),
  remove_var = "nzv",
  collapse_corr_feats = TRUE,
  to_numeric = TRUE,
  group_neg_corr = TRUE,
  prefilter_threshold = 1
)
dataset | Data frame with an outcome variable and other columns as features.
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
method | Methods to preprocess the data, described in caret::preProcess() (default: c("center", "scale"); use NULL for no normalization).
remove_var | Whether to remove variables with near-zero variance ("nzv"; default), zero variance ("zv"), or neither (NULL).
collapse_corr_feats | Whether to keep only one of perfectly correlated features.
to_numeric | Whether to change features to numeric where possible.
group_neg_corr | Whether to group negatively correlated features together (e.g. c(0,1) and c(1,0)).
prefilter_threshold | Remove features which only have non-zero & non-NA values in N rows or fewer (default: 1). Set this to -1 to keep all columns at this step. This step will also be skipped if to_numeric is set to FALSE.
Named list including:
dat_transformed: Preprocessed data.
grp_feats: If features were grouped together, a named list of the features corresponding to each group.
removed_feats: Any features that were removed during preprocessing (e.g. because there was zero variance or near-zero variance for those features).
If the progressr package is installed, a progress bar with time elapsed and estimated time to completion can be displayed.
See the preprocessing vignette for more details.
Note that if any values in outcome_colname contain spaces, they will be converted to underscores for compatibility with caret.
Zena Lapp, [email protected]
Kelly Sovacool, [email protected]
preprocess_data(mikropml::otu_small, "dx")

# the function can show a progress bar if you have the progressr package installed
## optionally, specify the progress bar format
progressr::handlers(progressr::handler_progress(
  format = ":message :bar :percent | elapsed: :elapsed | eta: :eta",
  clear = FALSE,
  show_after = 0
))
## tell progressr to always report progress
## Not run:
progressr::handlers(global = TRUE)
## run the function and watch the live progress updates
dat_preproc <- preprocess_data(mikropml::otu_small, "dx")
## End(Not run)
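To illustrate the note above about outcome values containing spaces, a small sketch with a hypothetical toy data frame; the outcome levels should come back with underscores (the same transformation replace_spaces() performs):

dat <- data.frame(
  dx = c("outcome 1", "outcome 2", "outcome 1"),
  a = 1:3, b = c(5, 7, 1)
)
preproc <- preprocess_data(dat, outcome_colname = "dx")
preproc$dat_transformed$dx # expected: "outcome_1" "outcome_2" "outcome_1"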
Randomize feature order to eliminate any position-dependent effects
randomize_feature_order(dataset, outcome_colname)
dataset | Data frame with an outcome variable and other columns as features.
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
Dataset with feature order randomized.
Nick Lesniak, [email protected]
Kelly Sovacool, [email protected]
dat <- data.frame(
  outcome = c("1", "2", "3"),
  a = 4:6, b = 7:9, c = 10:12, d = 13:15
)
randomize_feature_order(dat, "outcome")
Removes columns which only have non-zero & non-NA values in threshold row(s) or fewer.
remove_singleton_columns(dat, threshold = 1)
dat | dataframe
threshold | Number of rows. If a column only has non-zero & non-NA values in threshold rows or fewer, it will be removed (default: 1).
dataframe without singleton columns
Kelly Sovacool, [email protected]
Courtney Armour
remove_singleton_columns(data.frame(a = 1:3, b = c(0, 1, 0), c = 4:6))
remove_singleton_columns(data.frame(a = 1:3, b = c(0, 1, 0), c = 4:6), threshold = 0)
remove_singleton_columns(data.frame(a = 1:3, b = c(0, 1, NA), c = 4:6))
remove_singleton_columns(data.frame(a = 1:3, b = c(1, 1, 1), c = 4:6))
Replace spaces in all elements of a character vector with underscores
replace_spaces(x, new_char = "_")
x | a character vector
new_char | the character to replace spaces (default: "_")
character vector with all spaces replaced with new_char
Kelly Sovacool, [email protected]
dat <- data.frame(
  dx = c("outcome 1", "outcome 2", "outcome 1"),
  a = 1:3, b = c(5, 7, 1)
)
dat$dx <- replace_spaces(dat$dx)
dat
This function splits the data set into a train & test set, trains machine learning (ML) models using k-fold cross-validation, evaluates the best model on the held-out test set, and optionally calculates feature importance using the framework outlined in Topçuoğlu et al. 2020 (doi:10.1128/mBio.00434-20). Required inputs are a data frame (must contain an outcome variable and all other columns as features) and the ML method. See vignette('introduction') for more details.
run_ml(
  dataset,
  method,
  outcome_colname = NULL,
  hyperparameters = NULL,
  find_feature_importance = FALSE,
  calculate_performance = TRUE,
  kfold = 5,
  cv_times = 100,
  cross_val = NULL,
  training_frac = 0.8,
  perf_metric_function = NULL,
  perf_metric_name = NULL,
  groups = NULL,
  group_partitions = NULL,
  corr_thresh = 1,
  seed = NA,
  ...
)
dataset | Data frame with an outcome variable and other columns as features.
method | ML method. Options: c("glmnet", "rf", "rpart2", "svmRadial", "xgbTree").
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
hyperparameters | Dataframe of hyperparameters (default NULL; sensible defaults will be chosen automatically)
find_feature_importance | Run permutation importance (default: FALSE)
calculate_performance | Whether to calculate performance metrics (default: TRUE)
kfold | Fold number for k-fold cross-validation (default: 5)
cv_times | Number of cross-validation partitions to create (default: 100)
cross_val | a custom cross-validation scheme from caret::trainControl() (default: NULL, to use kfold cross-validation repeated cv_times)
training_frac | Fraction of data for training set (default: 0.8)
perf_metric_function | Function to calculate the performance metric to be used for cross-validation and test performance. Some functions are provided by caret (see caret::defaultSummary()).
perf_metric_name | The column name from the output of the function provided to perf_metric_function that is to be used as the performance metric. Defaults: binary classification = "AUC", multi-class classification = "logLoss", regression = "RMSE".
groups | Vector of groups to keep together when splitting the data into train and test sets. If the number of groups in the training set is larger than kfold, the groups will also be kept together for cross-validation (default: NULL)
group_partitions | Specify how to assign groups to the training and testing partitions (default: NULL)
corr_thresh | For feature importance, group correlations above or equal to corr_thresh (range 0 to 1; default: 1)
seed | Random seed (default: NA). Your results will only be reproducible if you set a seed.
... | All additional arguments are passed on to caret::train(), such as case weights via the weights argument or ntree for rf models.
Named list with results:
trained_model: Output of caret::train(), including the best model.
test_data: Part of the data that was used for testing.
performance: Data frame of performance metrics. The first column is the cross-validation performance metric, and the last two columns are the ML method used and the seed (if one was set), respectively. All other columns are performance metrics calculated on the test data. This contains only one row, so you can easily combine performance data frames from multiple calls to run_ml() (see vignette("parallel")).
feature_importance: If feature importances were calculated, a data frame where each row is a feature or correlated group. The columns are the performance metric of the permuted data, the difference between the true performance metric and the performance metric of the permuted data (true - permuted), the feature name, the ML method, the performance metric name, and the seed (if provided). For AUC and RMSE, the higher perf_metric_diff is, the more important that feature is for predicting the outcome. For log loss, the lower perf_metric_diff is, the more important that feature is for predicting the outcome.
For more details, please see the vignettes.
Begüm Topçuoğlu, [email protected]
Zena Lapp, [email protected]
Kelly Sovacool, [email protected]
## Not run:
# regression
run_ml(otu_small,
  "glmnet",
  seed = 2019
)

# random forest w/ feature importance
run_ml(otu_small,
  "rf",
  outcome_colname = "dx",
  find_feature_importance = TRUE
)

# custom cross validation & hyperparameters
run_ml(otu_mini_bin[, 2:11],
  "glmnet",
  outcome_colname = "Otu00001",
  seed = 2019,
  hyperparameters = list(lambda = c(1e-04), alpha = 0),
  cross_val = caret::trainControl(method = "none"),
  calculate_performance = FALSE
)
## End(Not run)
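A quick sketch of pulling the pieces out of the returned list, using the bundled otu_mini_bin_results_glmnet object so nothing needs to be retrained:

# the caret model, the held-out test data, and the one-row performance summary
otu_mini_bin_results_glmnet$trained_model
head(otu_mini_bin_results_glmnet$test_data)
otu_mini_bin_results_glmnet$performance
# feature importance (present here because this bundled result was run with feature importance)
otu_mini_bin_results_glmnet$feature_importance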
Used by plot_model_performance().
tidy_perf_data(performance_df)
performance_df | dataframe of performance results from multiple calls to run_ml()
Tidy dataframe with model performance metrics.
Begüm Topçuoğlu, [email protected]
Kelly Sovacool, [email protected]
## Not run:
# call `run_ml()` multiple times with different seeds
results_lst <- lapply(seq(100, 104), function(seed) {
  run_ml(otu_small, "glmnet", seed = seed)
})

# extract and combine the performance results
perf_df <- lapply(results_lst, function(result) {
  result[["performance"]]
}) %>%
  dplyr::bind_rows()

# make it pretty!
tidy_perf_data(perf_df)
## End(Not run)
Train model using caret::train().
train_model(
  train_data,
  outcome_colname,
  method,
  cv,
  perf_metric_name,
  tune_grid,
  ...
)
train_data | Training data. Expected to be a subset of the full dataset.
outcome_colname | Column name as a string of the outcome variable (default NULL; the first column will be chosen automatically)
method | ML method. Options: c("glmnet", "rf", "rpart2", "svmRadial", "xgbTree").
cv | Cross-validation caret scheme from define_cv().
perf_metric_name | The column name from the output of the function provided to perf_metric_function that is to be used as the performance metric. Defaults: binary classification = "AUC", multi-class classification = "logLoss", regression = "RMSE".
tune_grid | Tuning grid from get_tuning_grid().
... | All additional arguments are passed on to caret::train(), such as case weights via the weights argument or ntree for rf models.
Trained model from caret::train().
Zena Lapp, [email protected]
## Not run:
training_data <- otu_mini_bin_results_glmnet$trained_model$trainingData %>%
  dplyr::rename(dx = .outcome)
method <- "rf"
hyperparameters <- get_hyperparams_list(otu_mini_bin, method)
cross_val <- define_cv(training_data,
  "dx",
  hyperparameters,
  perf_metric_function = caret::multiClassSummary,
  class_probs = TRUE,
  cv_times = 2
)
tune_grid <- get_tuning_grid(hyperparameters, method)

rf_model <- train_model(
  training_data,
  "dx",
  method,
  cross_val,
  "AUC",
  tune_grid,
  ntree = 1000
)
rf_model$results %>% dplyr::select(mtry, AUC, prAUC)
## End(Not run)