| Title: | User-Friendly R Package for Supervised Machine Learning Pipelines |
|---|---|
| Description: | An interface to build machine learning models for classification and regression problems. 'mikropml' implements the ML pipeline described by Topçuoğlu et al. (2020) <doi:10.1128/mBio.00434-20> with reasonable default options for data preprocessing, hyperparameter tuning, cross-validation, testing, model evaluation, and interpretation steps. See the website <https://www.schlosslab.org/mikropml/> for more information, documentation, and examples. |
| Authors: | Begüm Topçuoğlu [aut] (ORCID: <https://orcid.org/0000-0003-3140-537X>), Zena Lapp [aut] (ORCID: <https://orcid.org/0000-0003-4674-2176>), Kelly Sovacool [aut, cre] (ORCID: <https://orcid.org/0000-0003-3283-829X>), Evan Snitkin [aut] (ORCID: <https://orcid.org/0000-0001-8409-278X>), Jenna Wiens [aut] (ORCID: <https://orcid.org/0000-0002-1057-7722>), Patrick Schloss [aut] (ORCID: <https://orcid.org/0000-0002-6935-4275>), Nick Lesniak [ctb] (ORCID: <https://orcid.org/0000-0001-9359-5194>), Courtney Armour [ctb] (ORCID: <https://orcid.org/0000-0002-5250-1224>), Sarah Lucas [ctb] (ORCID: <https://orcid.org/0000-0003-1676-5801>), Tuomas Borman [ctb] (ORCID: <https://orcid.org/0000-0002-8563-8884>) |
| Maintainer: | Kelly Sovacool <[email protected]> |
| License: | MIT + file LICENSE |
| Version: | 1.7.0.9000 |
| Built: | 2026-06-01 10:22:24 UTC |
| Source: | https://github.com/schlosslab/mikropml |
Uses rsample::bootstraps(), rsample::int_pctl(), and furrr::future_map()
bootstrap_performance( ml_result, outcome_colname, bootstrap_times = 10000, alpha = 0.05 )bootstrap_performance( ml_result, outcome_colname, bootstrap_times = 10000, alpha = 0.05 )
ml_result |
result returned from a single |
outcome_colname |
Column name as a string of the outcome variable
(default |
bootstrap_times |
the number of bootstraps to create (default: |
alpha |
the alpha level for the confidence interval (default |
a data frame with an estimate (.estimate), lower bound (.lower),
and upper bound (.upper) for each performance metric (term).
Kelly Sovacool, [email protected]
bootstrap_performance(otu_mini_bin_results_glmnet, "dx", bootstrap_times = 10, alpha = 0.10 ) ## Not run: outcome_colname <- "dx" run_ml(otu_mini_bin, "rf", outcome_colname = "dx") %>% bootstrap_performance(outcome_colname, bootstrap_times = 10000, alpha = 0.05 ) ## End(Not run)bootstrap_performance(otu_mini_bin_results_glmnet, "dx", bootstrap_times = 10, alpha = 0.10 ) ## Not run: outcome_colname <- "dx" run_ml(otu_mini_bin, "rf", outcome_colname = "dx") %>% bootstrap_performance(outcome_colname, bootstrap_times = 10000, alpha = 0.05 ) ## End(Not run)
Implements Equation 1 from Wu et al. 2021 doi:10.1016/j.ajhg.2021.08.012.
It is the same as Equation 7 if AUPRC (aka prAUC) is used in place of precision.
calc_balanced_precision(precision, prior)calc_balanced_precision(precision, prior)
precision |
actual precision of the model. |
prior |
baseline precision, aka frequency of positives. Can be calculated with calc_baseline_precision |
the expected precision if the data were balanced
Kelly Sovacool [email protected]
prior <- calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = "cancer" ) calc_balanced_precision(otu_mini_bin_results_rf$performance$Precision, prior) otu_mini_bin_results_rf$performance %>% dplyr::mutate( balanced_precision = calc_balanced_precision(Precision, prior), aubprc = calc_balanced_precision(prAUC, prior) ) %>% dplyr::select(AUC, Precision, balanced_precision, aubprc) # cumulative performance for a single model sensspec_1 <- calc_model_sensspec( otu_mini_bin_results_glmnet$trained_model, otu_mini_bin_results_glmnet$test_data, "dx" ) head(sensspec_1) prior <- calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = "cancer" ) sensspec_1 %>% dplyr::mutate(balanced_precision = calc_balanced_precision(precision, prior)) %>% dplyr::rename(recall = sensitivity) %>% calc_mean_perf(group_var = recall, sum_var = balanced_precision) %>% plot_mean_prc(ycol = mean_balanced_precision)prior <- calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = "cancer" ) calc_balanced_precision(otu_mini_bin_results_rf$performance$Precision, prior) otu_mini_bin_results_rf$performance %>% dplyr::mutate( balanced_precision = calc_balanced_precision(Precision, prior), aubprc = calc_balanced_precision(prAUC, prior) ) %>% dplyr::select(AUC, Precision, balanced_precision, aubprc) # cumulative performance for a single model sensspec_1 <- calc_model_sensspec( otu_mini_bin_results_glmnet$trained_model, otu_mini_bin_results_glmnet$test_data, "dx" ) head(sensspec_1) prior <- calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = "cancer" ) sensspec_1 %>% dplyr::mutate(balanced_precision = calc_balanced_precision(precision, prior)) %>% dplyr::rename(recall = sensitivity) %>% calc_mean_perf(group_var = recall, sum_var = balanced_precision) %>% plot_mean_prc(ycol = mean_balanced_precision)
Calculate the fraction of positives, i.e. baseline precision for a PRC curve
calc_baseline_precision(dataset, outcome_colname = NULL, pos_outcome = NULL)calc_baseline_precision(dataset, outcome_colname = NULL, pos_outcome = NULL)
dataset |
Data frame with an outcome variable and other columns as
features. Alternatively, the input can be in |
outcome_colname |
Column name as a string of the outcome variable
(default |
pos_outcome |
the positive outcome from |
the baseline precision based on the fraction of positives
Kelly Sovacool, [email protected]
# calculate the baseline precision data.frame(y = c("a", "b", "a", "b")) %>% calc_baseline_precision( outcome_colname = "y", pos_outcome = "a" ) calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = "cancer" ) # if you're not sure which outcome was used as the 'positive' outcome during # model training, you can access it from the trained model and pass it along: calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = otu_mini_bin_results_glmnet$trained_model$levels[1] )# calculate the baseline precision data.frame(y = c("a", "b", "a", "b")) %>% calc_baseline_precision( outcome_colname = "y", pos_outcome = "a" ) calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = "cancer" ) # if you're not sure which outcome was used as the 'positive' outcome during # model training, you can access it from the trained model and pass it along: calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = otu_mini_bin_results_glmnet$trained_model$levels[1] )
Used by calc_mean_roc() and calc_mean_prc().
calc_mean_perf(sensspec_dat, group_var = specificity, sum_var = sensitivity)calc_mean_perf(sensspec_dat, group_var = specificity, sum_var = sensitivity)
sensspec_dat |
data frame created by concatenating results of
|
group_var |
variable to group by (e.g. specificity or recall). |
sum_var |
variable to summarize (e.g. sensitivity or precision). |
data frame with mean & standard deviation of sum_var summarized over group_var
Courtney Armour
Kelly Sovacool
Use these functions to calculate cumulative sensitivity, specificity, recall, etc. on single models, concatenate the results together from multiple models, and compute mean ROC and PRC. You can then plot mean ROC and PRC curves to visualize the results. Note: These functions assume a binary outcome.
calc_model_sensspec(trained_model, test_data, outcome_colname = NULL) calc_mean_roc(sensspec_dat) calc_mean_prc(sensspec_dat)calc_model_sensspec(trained_model, test_data, outcome_colname = NULL) calc_mean_roc(sensspec_dat) calc_mean_prc(sensspec_dat)
trained_model |
Trained model from |
test_data |
Held out test data: dataframe of outcome and features. |
outcome_colname |
Column name as a string of the outcome variable
(default |
sensspec_dat |
data frame created by concatenating results of
|
data frame with summarized performance
calc_model_sensspec(): Get sensitivity, specificity, and precision for a model.
calc_mean_roc(): Calculate mean sensitivity over specificity for multiple models
calc_mean_prc(): Calculate mean precision over recall for multiple models
Courtney Armour
Kelly Sovacool, [email protected]
## Not run: library(dplyr) # get cumulative performance for a single model sensspec_1 <- calc_model_sensspec( otu_mini_bin_results_glmnet$trained_model, otu_mini_bin_results_glmnet$test_data, "dx" ) head(sensspec_1) # get performance for multiple models get_sensspec_seed <- function(seed) { ml_result <- run_ml(otu_mini_bin, "glmnet", seed = seed) sensspec <- calc_model_sensspec( ml_result$trained_model, ml_result$test_data, "dx" ) %>% dplyr::mutate(seed = seed) return(sensspec) } sensspec_dat <- purrr::map_dfr(seq(100, 102), get_sensspec_seed) # calculate mean sensitivity over specificity roc_dat <- calc_mean_roc(sensspec_dat) head(roc_dat) # calculate mean precision over recall prc_dat <- calc_mean_prc(sensspec_dat) head(prc_dat) # plot ROC & PRC roc_dat %>% plot_mean_roc() baseline_prec <- calc_baseline_precision(otu_mini_bin, "dx", "cancer") prc_dat %>% plot_mean_prc(baseline_precision = baseline_prec) # balanced precision prior <- calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = "cancer" ) bprc_dat <- sensspec_dat %>% dplyr::mutate(balanced_precision = calc_balanced_precision(precision, prior)) %>% dplyr::rename(recall = sensitivity) %>% calc_mean_perf(group_var = recall, sum_var = balanced_precision) bprc_dat %>% plot_mean_prc(ycol = mean_balanced_precision) + ylab("Mean Bal. 
Precision") ## End(Not run)## Not run: library(dplyr) # get cumulative performance for a single model sensspec_1 <- calc_model_sensspec( otu_mini_bin_results_glmnet$trained_model, otu_mini_bin_results_glmnet$test_data, "dx" ) head(sensspec_1) # get performance for multiple models get_sensspec_seed <- function(seed) { ml_result <- run_ml(otu_mini_bin, "glmnet", seed = seed) sensspec <- calc_model_sensspec( ml_result$trained_model, ml_result$test_data, "dx" ) %>% dplyr::mutate(seed = seed) return(sensspec) } sensspec_dat <- purrr::map_dfr(seq(100, 102), get_sensspec_seed) # calculate mean sensitivity over specificity roc_dat <- calc_mean_roc(sensspec_dat) head(roc_dat) # calculate mean precision over recall prc_dat <- calc_mean_prc(sensspec_dat) head(prc_dat) # plot ROC & PRC roc_dat %>% plot_mean_roc() baseline_prec <- calc_baseline_precision(otu_mini_bin, "dx", "cancer") prc_dat %>% plot_mean_prc(baseline_precision = baseline_prec) # balanced precision prior <- calc_baseline_precision(otu_mini_bin, outcome_colname = "dx", pos_outcome = "cancer" ) bprc_dat <- sensspec_dat %>% dplyr::mutate(balanced_precision = calc_balanced_precision(precision, prior)) %>% dplyr::rename(recall = sensitivity) %>% calc_mean_perf(group_var = recall, sum_var = balanced_precision) bprc_dat %>% plot_mean_prc(ycol = mean_balanced_precision) + ylab("Mean Bal. Precision") ## End(Not run)
Get performance metrics for test data
calc_perf_metrics( test_data, trained_model, outcome_colname, perf_metric_function, class_probs )calc_perf_metrics( test_data, trained_model, outcome_colname, perf_metric_function, class_probs )
test_data |
Held out test data: dataframe of outcome and features. |
trained_model |
Trained model from |
outcome_colname |
Column name as a string of the outcome variable
(default |
perf_metric_function |
Function to calculate the performance metric to
be used for cross-validation and test performance. Some functions are
provided by caret (see |
class_probs |
Whether to use class probabilities (TRUE for categorical outcomes, FALSE for numeric outcomes). |
Dataframe of performance metrics.
Zena Lapp, [email protected]
## Not run: results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2) calc_perf_metrics(results$test_data, results$trained_model, "dx", multiClassSummary, class_probs = TRUE ) ## End(Not run)## Not run: results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2) calc_perf_metrics(results$test_data, results$trained_model, "dx", multiClassSummary, class_probs = TRUE ) ## End(Not run)
Combine hyperparameter performance metrics for multiple train/test splits generated by, for instance, looping in R or using a snakemake workflow on a high-performance computer.
combine_hp_performance(trained_model_lst)combine_hp_performance(trained_model_lst)
trained_model_lst |
List of trained models. |
Named list:
dat: Dataframe of performance metric for each group of hyperparameters
params: Hyperparameters tuned.
metric: Performance metric used.
Zena Lapp, [email protected]
## Not run: results <- lapply(seq(100, 102), function(seed) { run_ml(otu_small, "glmnet", seed = seed, cv_times = 2, kfold = 2) }) models <- lapply(results, function(x) x$trained_model) combine_hp_performance(models) ## End(Not run)## Not run: results <- lapply(seq(100, 102), function(seed) { run_ml(otu_small, "glmnet", seed = seed, cv_times = 2, kfold = 2) }) models <- lapply(results, function(x) x$trained_model) combine_hp_performance(models) ## End(Not run)
A wrapper for permute_p_value().
compare_models(merged_data, metric, group_name, nperm = 10000)compare_models(merged_data, metric, group_name, nperm = 10000)
merged_data |
the concatenated performance data from |
metric |
metric to compare, must be numeric |
group_name |
column with group variables to compare |
nperm |
number of permutations, default=10000 |
a table of p-values for all pairs of groups in the group variable
Courtney R Armour, [email protected]
df <- dplyr::tibble( model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) ) set.seed(123) compare_models(df, "AUC", "model", nperm = 10)df <- dplyr::tibble( model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) ) set.seed(123) compare_models(df, "AUC", "model", nperm = 10)
Define cross-validation scheme and training parameters
define_cv( train_data, outcome_colname, hyperparams_list, perf_metric_function, class_probs, kfold = 5, cv_times = 100, groups = NULL, group_partitions = NULL )define_cv( train_data, outcome_colname, hyperparams_list, perf_metric_function, class_probs, kfold = 5, cv_times = 100, groups = NULL, group_partitions = NULL )
train_data |
Dataframe for training model. |
outcome_colname |
Column name as a string of the outcome variable
(default |
hyperparams_list |
Named list of lists of hyperparameters. |
perf_metric_function |
Function to calculate the performance metric to
be used for cross-validation and test performance. Some functions are
provided by caret (see |
class_probs |
Whether to use class probabilities (TRUE for categorical outcomes, FALSE for numeric outcomes). |
kfold |
Fold number for k-fold cross-validation (default: |
cv_times |
Number of cross-validation partitions to create (default:
|
groups |
Vector of groups to keep together when splitting the data into
train and test sets. If the number of groups in the training set is larger
than |
group_partitions |
Specify how to assign |
Caret object for trainControl that controls cross-validation
Begüm Topçuoğlu, [email protected]
Kelly Sovacool, [email protected]
training_inds <- get_partition_indices(otu_small %>% dplyr::pull("dx"), training_frac = 0.8, groups = NULL ) train_data <- otu_small[training_inds, ] test_data <- otu_small[-training_inds, ] cv <- define_cv(train_data, outcome_colname = "dx", hyperparams_list = get_hyperparams_list(otu_small, "glmnet"), perf_metric_function = caret::multiClassSummary, class_probs = TRUE, kfold = 5 )training_inds <- get_partition_indices(otu_small %>% dplyr::pull("dx"), training_frac = 0.8, groups = NULL ) train_data <- otu_small[training_inds, ] test_data <- otu_small[-training_inds, ] cv <- define_cv(train_data, outcome_colname = "dx", hyperparams_list = get_hyperparams_list(otu_small, "glmnet"), perf_metric_function = caret::multiClassSummary, class_probs = TRUE, kfold = 5 )
Get preprocessed dataframe for continuous variables
get_caret_processed_df(features, method)get_caret_processed_df(features, method)
features |
Dataframe of features for machine learning |
method |
Methods to preprocess the data, described in
|
Named list:
processed: Dataframe of processed features.
removed: Names of any features removed during preprocessing.
Zena Lapp, [email protected]
get_caret_processed_df(mikropml::otu_small[, 2:ncol(otu_small)], c("center", "scale"))get_caret_processed_df(mikropml::otu_small[, 2:ncol(otu_small)], c("center", "scale"))
Calculates feature importance using a trained model and test data. Requires
the future.apply package.
get_feature_importance( trained_model, test_data, outcome_colname, perf_metric_function, perf_metric_name, class_probs, method, seed = NA, corr_thresh = 1, groups = NULL, nperms = 100, corr_method = "spearman" )get_feature_importance( trained_model, test_data, outcome_colname, perf_metric_function, perf_metric_name, class_probs, method, seed = NA, corr_thresh = 1, groups = NULL, nperms = 100, corr_method = "spearman" )
trained_model |
Trained model from |
test_data |
Held out test data: dataframe of outcome and features. |
outcome_colname |
Column name as a string of the outcome variable
(default |
perf_metric_function |
Function to calculate the performance metric to
be used for cross-validation and test performance. Some functions are
provided by caret (see |
perf_metric_name |
The column name from the output of the function
provided to perf_metric_function that is to be used as the performance
metric. Defaults: binary classification = |
class_probs |
Whether to use class probabilities (TRUE for categorical outcomes, FALSE for numeric outcomes). |
method |
ML method. Options:
|
seed |
Random seed (default: |
corr_thresh |
For feature importance, group correlations above or equal
to |
groups |
Vector of feature names to group together during permutation.
Each element should be a string with feature names separated by a pipe
character ( |
nperms |
number of permutations to perform (default: |
corr_method |
Correlation method. Options are the same as those supported
by |
For permutation tests, the p-value is the number of permutation statistics that are greater than the test statistic, divided by the number of permutations. In our case, the permutation statistic is the model performance (e.g. AUROC) after randomizing the order of observations for one feature, and the test statistic is the actual performance on the test data. By default we perform 100 permutations per feature; increasing this will increase the precision of estimating the null distribution, but also increases runtime. The p-value represents the probability of obtaining the actual performance in the event that the null hypothesis is true, where the null hypothesis is that the feature is not important for model performance.
We strongly recommend providing multiple cores to speed up computation time. See our vignette on parallel processing for more details.
Data frame with performance metrics for when each feature (or group
of correlated features; feat) is permuted (perf_metric), differences
between the actual test performance metric and the permuted performance
metric (perf_metric_diff; test minus permuted performance), and the
p-value (pvalue: the probability of obtaining the actual performance
value under the null hypothesis). Features with a larger perf_metric_diff
are more important. The performance metric name (perf_metric_name) and
seed (seed) are also returned.
Begüm Topçuoğlu, [email protected]
Zena Lapp, [email protected]
Kelly Sovacool, [email protected]
## Not run: # If you called `run_ml()` with `feature_importance = FALSE` (the default), # you can use `get_feature_importance()` later as long as you have the # trained model and test data. results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2) names(results$trained_model$trainingData)[1] <- "dx" feat_imp <- get_feature_importance(results$trained_model, results$trained_model$trainingData, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet" ) # We strongly recommend providing multiple cores to speed up computation time. # Do this before calling `get_feature_importance()`. doFuture::registerDoFuture() future::plan(future::multicore, workers = 2) # Optionally, you can group features together with a custom grouping feat_imp <- get_feature_importance(results$trained_model, results$trained_model$trainingData, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet", groups = c( "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012", "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022", "Otu00023", "Otu00025", "Otu00028", "Otu00029", "Otu00030", "Otu00035", "Otu00036", "Otu00037", "Otu00038", "Otu00039", "Otu00040", "Otu00047", "Otu00050", "Otu00052", "Otu00054", "Otu00055", "Otu00056", "Otu00060", "Otu00003|Otu00002|Otu00005|Otu00024|Otu00032|Otu00041|Otu00053", "Otu00014|Otu00021|Otu00017|Otu00031|Otu00057", "Otu00013|Otu00006", "Otu00026|Otu00001|Otu00034|Otu00048", "Otu00033|Otu00010", "Otu00042|Otu00004", "Otu00043|Otu00027|Otu00049", "Otu00051|Otu00045", "Otu00058|Otu00044", "Otu00059|Otu00046" ) ) # the function can show a progress bar if you have the `progressr` package installed. 
## optionally, specify the progress bar format: progressr::handlers(progressr::handler_progress( format = ":message :bar :percent | elapsed: :elapsed | eta: :eta", clear = FALSE, show_after = 0 )) ## tell progressr to always report progress progressr::handlers(global = TRUE) ## run the function and watch the live progress udpates feat_imp <- get_feature_importance(results$trained_model, results$trained_model$trainingData, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet" ) # You can specify any correlation method supported by `stats::cor`: feat_imp <- get_feature_importance(results$trained_model, results$trained_model$trainingData, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet", corr_method = "pearson" ) ## End(Not run)## Not run: # If you called `run_ml()` with `feature_importance = FALSE` (the default), # you can use `get_feature_importance()` later as long as you have the # trained model and test data. results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2) names(results$trained_model$trainingData)[1] <- "dx" feat_imp <- get_feature_importance(results$trained_model, results$trained_model$trainingData, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet" ) # We strongly recommend providing multiple cores to speed up computation time. # Do this before calling `get_feature_importance()`. 
doFuture::registerDoFuture() future::plan(future::multicore, workers = 2) # Optionally, you can group features together with a custom grouping feat_imp <- get_feature_importance(results$trained_model, results$trained_model$trainingData, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet", groups = c( "Otu00007", "Otu00008", "Otu00009", "Otu00011", "Otu00012", "Otu00015", "Otu00016", "Otu00018", "Otu00019", "Otu00020", "Otu00022", "Otu00023", "Otu00025", "Otu00028", "Otu00029", "Otu00030", "Otu00035", "Otu00036", "Otu00037", "Otu00038", "Otu00039", "Otu00040", "Otu00047", "Otu00050", "Otu00052", "Otu00054", "Otu00055", "Otu00056", "Otu00060", "Otu00003|Otu00002|Otu00005|Otu00024|Otu00032|Otu00041|Otu00053", "Otu00014|Otu00021|Otu00017|Otu00031|Otu00057", "Otu00013|Otu00006", "Otu00026|Otu00001|Otu00034|Otu00048", "Otu00033|Otu00010", "Otu00042|Otu00004", "Otu00043|Otu00027|Otu00049", "Otu00051|Otu00045", "Otu00058|Otu00044", "Otu00059|Otu00046" ) ) # the function can show a progress bar if you have the `progressr` package installed. ## optionally, specify the progress bar format: progressr::handlers(progressr::handler_progress( format = ":message :bar :percent | elapsed: :elapsed | eta: :eta", clear = FALSE, show_after = 0 )) ## tell progressr to always report progress progressr::handlers(global = TRUE) ## run the function and watch the live progress udpates feat_imp <- get_feature_importance(results$trained_model, results$trained_model$trainingData, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet" ) # You can specify any correlation method supported by `stats::cor`: feat_imp <- get_feature_importance(results$trained_model, results$trained_model$trainingData, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet", corr_method = "pearson" ) ## End(Not run)
Get hyperparameter performance metrics
get_hp_performance(trained_model)get_hp_performance(trained_model)
trained_model |
trained model (e.g. from |
Named list:
dat: Dataframe of performance metric for each group of hyperparameters.
params: Hyperparameters tuned.
metric: Performance metric used.
Zena Lapp, [email protected]
Kelly Sovacool [email protected]
get_hp_performance(otu_mini_bin_results_glmnet$trained_model)get_hp_performance(otu_mini_bin_results_glmnet$trained_model)
For more details see the vignette on hyperparameter tuning.
get_hyperparams_list(dataset, method)get_hyperparams_list(dataset, method)
dataset |
Data frame with an outcome variable and other columns as
features. Alternatively, the input can be in |
method |
ML method. Options:
|
Named list of hyperparameters.
Kelly Sovacool, [email protected]
get_hyperparams_list(otu_mini_bin, "rf") get_hyperparams_list(otu_small, "rf") get_hyperparams_list(otu_mini_bin, "rpart2") get_hyperparams_list(otu_small, "rpart2")get_hyperparams_list(otu_mini_bin, "rf") get_hyperparams_list(otu_small, "rf") get_hyperparams_list(otu_mini_bin, "rpart2") get_hyperparams_list(otu_small, "rpart2")
If the outcome is numeric, the type is continuous. Otherwise, the outcome type is binary if there are only two outcomes or multiclass if there are more than two outcomes.
get_outcome_type(outcomes_vec)get_outcome_type(outcomes_vec)
outcomes_vec |
Vector of outcomes. |
Outcome type (continuous, binary, or multiclass).
Zena Lapp, [email protected]
get_outcome_type(c(1, 2, 1)) get_outcome_type(c("a", "b", "b")) get_outcome_type(c("a", "b", "c"))get_outcome_type(c(1, 2, 1)) get_outcome_type(c("a", "b", "b")) get_outcome_type(c("a", "b", "c"))
Use this function to get the row indices for the training set.
get_partition_indices( outcomes, training_frac = 0.8, groups = NULL, group_partitions = NULL )get_partition_indices( outcomes, training_frac = 0.8, groups = NULL, group_partitions = NULL )
outcomes |
vector of outcomes |
training_frac |
Fraction of data for training set (default: |
groups |
Vector of groups to keep together when splitting the data into
train and test sets. If the number of groups in the training set is larger
than |
group_partitions |
Specify how to assign |
If groups is NULL, uses createDataPartition.
Otherwise, uses create_grouped_data_partition().
Set the seed prior to calling this function if you would like your data partitions to be reproducible (recommended).
Vector of row indices for the training set.
Kelly Sovacool, [email protected]
training_inds <- get_partition_indices(otu_mini_bin$dx) train_data <- otu_mini_bin[training_inds, ] test_data <- otu_mini_bin[-training_inds, ]training_inds <- get_partition_indices(otu_mini_bin$dx) train_data <- otu_mini_bin[training_inds, ] test_data <- otu_mini_bin[-training_inds, ]
Get default performance metric function
get_perf_metric_fn(outcome_type)get_perf_metric_fn(outcome_type)
outcome_type |
Type of outcome (one of: |
Performance metric function.
Zena Lapp, [email protected]
get_perf_metric_fn("continuous") get_perf_metric_fn("binary") get_perf_metric_fn("multiclass")get_perf_metric_fn("continuous") get_perf_metric_fn("binary") get_perf_metric_fn("multiclass")
Get default performance metric name for cross-validation.
get_perf_metric_name(outcome_type)get_perf_metric_name(outcome_type)
outcome_type |
Type of outcome (one of: |
Performance metric name.
Zena Lapp, [email protected]
get_perf_metric_name("continuous") get_perf_metric_name("binary") get_perf_metric_name("multiclass")get_perf_metric_name("continuous") get_perf_metric_name("binary") get_perf_metric_name("multiclass")
Get model performance metrics as a one-row tibble
get_performance_tbl( trained_model, test_data, outcome_colname, perf_metric_function, perf_metric_name, class_probs, method, seed = NA )get_performance_tbl( trained_model, test_data, outcome_colname, perf_metric_function, perf_metric_name, class_probs, method, seed = NA )
trained_model |
Trained model from |
test_data |
Held out test data: dataframe of outcome and features. |
outcome_colname |
Column name as a string of the outcome variable
(default |
perf_metric_function |
Function to calculate the performance metric to
be used for cross-validation and test performance. Some functions are
provided by caret (see |
perf_metric_name |
The column name from the output of the function
provided to perf_metric_function that is to be used as the performance
metric. Defaults: binary classification = |
class_probs |
Whether to use class probabilities (TRUE for categorical outcomes, FALSE for numeric outcomes). |
method |
ML method. Options:
|
seed |
Random seed (default: |
A one-row tibble with a column for the cross-validation performance,
columns for each of the performance metrics for the test data,
plus the method, and seed.
Kelly Sovacool, [email protected]
Zena Lapp, [email protected]
## Not run: results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2) names(results$trained_model$trainingData)[1] <- "dx" get_performance_tbl(results$trained_model, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet" ) ## End(Not run)## Not run: results <- run_ml(otu_small, "glmnet", kfold = 2, cv_times = 2) names(results$trained_model$trainingData)[1] <- "dx" get_performance_tbl(results$trained_model, results$test_data, "dx", multiClassSummary, "AUC", class_probs = TRUE, method = "glmnet" ) ## End(Not run)
Generate the tuning grid for tuning hyperparameters
get_tuning_grid(hyperparams_list, method)get_tuning_grid(hyperparams_list, method)
hyperparams_list |
Named list of lists of hyperparameters. |
method |
ML method. Options:
|
The tuning grid.
Begüm Topçuoğlu, [email protected]
Kelly Sovacool, [email protected]
ml_method <- "glmnet" hparams_list <- get_hyperparams_list(otu_small, ml_method) get_tuning_grid(hparams_list, ml_method)ml_method <- "glmnet" hparams_list <- get_hyperparams_list(otu_small, ml_method) get_tuning_grid(hparams_list, ml_method)
This is the result of running preprocess_data("otu_mini_bin")
otu_data_preprocotu_data_preproc
An object of class list of length 3.
A dataset containing relative abundances of OTUs for human stool samples
with a binary outcome, dx.
This is a subset of otu_small.
otu_mini_binotu_mini_bin
A data frame
The dx column is the diagnosis: healthy or cancerous (colorectal).
All other columns are OTU relative abundances.
Results from running the pipeline with L2 logistic regression on otu_mini_bin with feature importance and grouping
otu_mini_bin_results_glmnetotu_mini_bin_results_glmnet
An object of class list of length 4.
otu_mini_bin
Results from running the pipeline with random forest on otu_mini_bin
otu_mini_bin_results_rfotu_mini_bin_results_rf
An object of class list of length 4.
otu_mini_bin
Results from running the pipeline with rpart2 on otu_mini_bin
otu_mini_bin_results_rpart2otu_mini_bin_results_rpart2
An object of class list of length 4.
otu_mini_bin
Results from running the pipeline with svmRadial on otu_mini_bin
otu_mini_bin_results_svmRadialotu_mini_bin_results_svmRadial
An object of class list of length 4.
otu_mini_bin
Results from running the pipeline with xgbTree on otu_mini_bin
otu_mini_bin_results_xgbTreeotu_mini_bin_results_xgbTree
An object of class list of length 4.
Results from running the pipeline with glmnet on otu_mini_bin with Otu00001
as the outcome
otu_mini_cont_results_glmnetotu_mini_cont_results_glmnet
An object of class list of length 4.
Results from running the pipeline with glmnet on otu_mini_bin with Otu00001
as the outcome column,
using a custom train control scheme that does not perform cross-validation
otu_mini_cont_results_nocvotu_mini_cont_results_nocv
An object of class list of length 4.
Cross validation on train_data_mini with grouped features.
otu_mini_cvotu_mini_cv
An object of class list of length 27.
A dataset containing relative abundances of OTUs for human stool samples
otu_mini_multiotu_mini_multi
A data frame
The dx column is the colorectal cancer diagnosis: adenoma, carcinoma, normal.
All other columns are OTU relative abundances.
Groups for otu_mini_multi
otu_mini_multi_groupotu_mini_multi_group
An object of class character of length 490.
Results from running the pipeline with glmnet on otu_mini_multi for
multiclass outcomes
otu_mini_multi_results_glmnetotu_mini_multi_results_glmnet
An object of class list of length 4.
A dataset containing relative abundances of 60 OTUs for 60 human stool samples.
This is a subset of the data provided in extdata/otu_large.csv, which was
used in Topçuoğlu et al. 2020.
otu_smallotu_small
A data frame with 60 rows and 61 variables.
The dx column is the diagnosis: healthy or cancerous (colorectal).
All other columns are OTU relative abundances.
Calculate a permuted p-value comparing two models
permute_p_value( merged_data, metric, group_name, group_1, group_2, nperm = 10000 )permute_p_value( merged_data, metric, group_name, group_1, group_2, nperm = 10000 )
merged_data |
the concatenated performance data from |
metric |
metric to compare, must be numeric |
group_name |
column with group variables to compare |
group_1 |
name of one group to compare |
group_2 |
name of other group to compare |
nperm |
number of permutations, default=10000 |
numeric p-value comparing two models
Begüm Topçuoğlu, [email protected]
Courtney R Armour, [email protected]
df <- dplyr::tibble( model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) ) set.seed(123) permute_p_value(df, "AUC", "model", "rf", "glmnet", nperm = 100)df <- dplyr::tibble( model = c("rf", "rf", "glmnet", "glmnet", "svmRadial", "svmRadial"), AUC = c(.2, 0.3, 0.8, 0.9, 0.85, 0.95) ) set.seed(123) permute_p_value(df, "AUC", "model", "rf", "glmnet", nperm = 100)
Plot hyperparameter performance metrics
plot_hp_performance(dat, param_col, metric_col)plot_hp_performance(dat, param_col, metric_col)
dat |
dataframe of hyperparameters and performance metric (e.g. from |
param_col |
hyperparameter to be plotted. must be a column in |
metric_col |
performance metric. must be a column in |
ggplot of hyperparameter performance.
Zena Lapp, [email protected]
Kelly Sovacool [email protected]
# plot for a single `run_ml()` call hp_metrics <- get_hp_performance(otu_mini_bin_results_glmnet$trained_model) hp_metrics plot_hp_performance(hp_metrics$dat, lambda, AUC) ## Not run: # plot for multiple `run_ml()` calls results <- lapply(seq(100, 102), function(seed) { run_ml(otu_small, "glmnet", seed = seed) }) models <- lapply(results, function(x) x$trained_model) hp_metrics <- combine_hp_performance(models) plot_hp_performance(hp_metrics$dat, lambda, AUC) ## End(Not run)# plot for a single `run_ml()` call hp_metrics <- get_hp_performance(otu_mini_bin_results_glmnet$trained_model) hp_metrics plot_hp_performance(hp_metrics$dat, lambda, AUC) ## Not run: # plot for multiple `run_ml()` calls results <- lapply(seq(100, 102), function(seed) { run_ml(otu_small, "glmnet", seed = seed) }) models <- lapply(results, function(x) x$trained_model) hp_metrics <- combine_hp_performance(models) plot_hp_performance(hp_metrics$dat, lambda, AUC) ## End(Not run)
Plot ROC and PRC curves
plot_mean_roc(dat, ribbon_fill = "#C6DBEF", line_color = "#08306B") plot_mean_prc( dat, baseline_precision = NULL, ycol = mean_precision, ribbon_fill = "#C7E9C0", line_color = "#00441B" )plot_mean_roc(dat, ribbon_fill = "#C6DBEF", line_color = "#08306B") plot_mean_prc( dat, baseline_precision = NULL, ycol = mean_precision, ribbon_fill = "#C7E9C0", line_color = "#00441B" )
dat |
sensitivity, specificity, and precision data calculated by |
ribbon_fill |
ribbon fill color (default: "#C6DBEF" for plot_mean_roc(), "#C7E9C0" for plot_mean_prc()) |
line_color |
line color (default: "#08306B" for plot_mean_roc(), "#00441B" for plot_mean_prc()) |
baseline_precision |
baseline precision from |
ycol |
column for the y axis (Default: |
plot_mean_roc(): Plot mean sensitivity over specificity
plot_mean_prc(): Plot mean precision over recall
Courtney Armour
Kelly Sovacool [email protected]
## Not run: library(dplyr) # get performance for multiple models get_sensspec_seed <- function(seed) { ml_result <- run_ml(otu_mini_bin, "glmnet", seed = seed) sensspec <- calc_model_sensspec( ml_result$trained_model, ml_result$test_data, "dx" ) %>% mutate(seed = seed) return(sensspec) } sensspec_dat <- purrr::map_dfr(seq(100, 102), get_sensspec_seed) # plot ROC & PRC sensspec_dat %>% calc_mean_roc() %>% plot_mean_roc() baseline_prec <- calc_baseline_precision(otu_mini_bin, "dx", "cancer") sensspec_dat %>% calc_mean_prc() %>% plot_mean_prc(baseline_precision = baseline_prec) ## End(Not run)## Not run: library(dplyr) # get performance for multiple models get_sensspec_seed <- function(seed) { ml_result <- run_ml(otu_mini_bin, "glmnet", seed = seed) sensspec <- calc_model_sensspec( ml_result$trained_model, ml_result$test_data, "dx" ) %>% mutate(seed = seed) return(sensspec) } sensspec_dat <- purrr::map_dfr(seq(100, 102), get_sensspec_seed) # plot ROC & PRC sensspec_dat %>% calc_mean_roc() %>% plot_mean_roc() baseline_prec <- calc_baseline_precision(otu_mini_bin, "dx", "cancer") sensspec_dat %>% calc_mean_prc() %>% plot_mean_prc(baseline_precision = baseline_prec) ## End(Not run)
ggplot2 is required to use this function.
plot_model_performance(performance_df)plot_model_performance(performance_df)
performance_df |
dataframe of performance results from multiple calls to |
A ggplot2 plot of performance.
Begüm Topçuoğlu, [email protected]
Kelly Sovacool, [email protected]
## Not run: # call `run_ml()` multiple times with different seeds results_lst <- lapply(seq(100, 104), function(seed) { run_ml(otu_small, "glmnet", seed = seed) }) # extract and combine the performance results perf_df <- lapply(results_lst, function(result) { result[["performance"]] }) %>% dplyr::bind_rows() # plot the performance results p <- plot_model_performance(perf_df) # call `run_ml()` with different ML methods param_grid <- expand.grid( seeds = seq(100, 104), methods = c("glmnet", "rf") ) results_mtx <- mapply( function(seed, method) { run_ml(otu_mini_bin, method, seed = seed, kfold = 2) }, param_grid$seeds, param_grid$methods ) # extract and combine the performance results perf_df2 <- dplyr::bind_rows(results_mtx["performance", ]) # plot the performance results p <- plot_model_performance(perf_df2) # you can continue adding layers to customize the plot p + theme_classic() + scale_color_brewer(palette = "Dark2") + coord_flip() ## End(Not run)## Not run: # call `run_ml()` multiple times with different seeds results_lst <- lapply(seq(100, 104), function(seed) { run_ml(otu_small, "glmnet", seed = seed) }) # extract and combine the performance results perf_df <- lapply(results_lst, function(result) { result[["performance"]] }) %>% dplyr::bind_rows() # plot the performance results p <- plot_model_performance(perf_df) # call `run_ml()` with different ML methods param_grid <- expand.grid( seeds = seq(100, 104), methods = c("glmnet", "rf") ) results_mtx <- mapply( function(seed, method) { run_ml(otu_mini_bin, method, seed = seed, kfold = 2) }, param_grid$seeds, param_grid$methods ) # extract and combine the performance results perf_df2 <- dplyr::bind_rows(results_mtx["performance", ]) # plot the performance results p <- plot_model_performance(perf_df2) # you can continue adding layers to customize the plot p + theme_classic() + scale_color_brewer(palette = "Dark2") + coord_flip() ## End(Not run)
Function to preprocess your data for input into run_ml().
preprocess_data(dataset, ...) ## S4 method for signature 'TreeSummarizedExperiment' preprocess_data( dataset, outcome_colname, assay.type = "counts", col.var = NULL, altexp = NULL, name = "preprocessed", ... ) ## S4 method for signature 'ANY' preprocess_data( dataset, outcome_colname, method = c("center", "scale"), remove_var = "nzv", collapse_corr_feats = TRUE, corr_method = "spearman", corr_thresh = 1, to_numeric = TRUE, group_neg_corr = TRUE, prefilter_threshold = 1, ... )preprocess_data(dataset, ...) ## S4 method for signature 'TreeSummarizedExperiment' preprocess_data( dataset, outcome_colname, assay.type = "counts", col.var = NULL, altexp = NULL, name = "preprocessed", ... ) ## S4 method for signature 'ANY' preprocess_data( dataset, outcome_colname, method = c("center", "scale"), remove_var = "nzv", collapse_corr_feats = TRUE, corr_method = "spearman", corr_thresh = 1, to_numeric = TRUE, group_neg_corr = TRUE, prefilter_threshold = 1, ... )
dataset |
Data frame with an outcome variable and other columns as
features. Alternatively, the input can be in |
... |
All additional arguments are passed on to |
outcome_colname |
Column name as a string of the outcome variable
(default |
assay.type |
The name of assay from |
col.var |
The name of sample metadata variables from |
altexp |
The name of alternative experiment ( |
name |
Name of results used when the input is
|
method |
Methods to preprocess the data, described in
|
remove_var |
Whether to remove variables with near-zero variance
( |
collapse_corr_feats |
Whether to keep only one of correlated features
(see |
corr_method |
Correlation method. Options are the same as those supported
by |
corr_thresh |
group correlations above or equal to |
to_numeric |
Whether to change features to numeric where possible. |
group_neg_corr |
Whether to group negatively correlated features together (e.g. c(0,1) and c(1,0)). |
prefilter_threshold |
Remove features which only have non-zero & non-NA
values in N rows or fewer (default: 1). Set this to -1 to keep all columns
at this step. This step will also be skipped if |
Named list including:
dat_transformed: Preprocessed data.
grp_feats: If features were grouped together, a named list of the features corresponding to each group.
removed_feats: Any features that were removed during preprocessing (e.g. because there was zero variance or near-zero variance for those features).
If the input is TreeSummarizedExperiment, the output is added as an
additional data to the input object. If the set of features match in output
and input, the results are stored directly to assay slot. If they
do not match, the output is stored to altExp slot of the object.
If the progressr package is installed, a progress bar with time elapsed
and estimated time to completion can be displayed.
See the preprocessing vignette for more details.
Note that if any values in outcome_colname contain spaces, they will be
converted to underscores for compatibility with caret.
Zena Lapp, [email protected]
Kelly Sovacool, [email protected]
preprocess_data(mikropml::otu_small, "dx") # the function can show a progress bar if you have the progressr package installed ## optionally, specify the progress bar format progressr::handlers(progressr::handler_progress( format = ":message :bar :percent | elapsed: :elapsed | eta: :eta", clear = FALSE, show_after = 0 )) ## tell progressor to always report progress ## Not run: progressr::handlers(global = TRUE) ## run the function and watch the live progress udpates dat_preproc <- preprocess_data(mikropml::otu_small, "dx") # Create TreeSE object library(TreeSummarizedExperiment) df <- mikropml::otu_small assay <- df[, !colnames(df) %in% c("dx"), drop = FALSE] |> t() |> as.matrix() tse <- TreeSummarizedExperiment(assays = SimpleList(counts = assay)) colData(tse)[["dx"]] <- df[["dx"]] # Preprocess tse <- preprocess_data( dataset = tse, assay.type = "counts", outcome_colname = "dx" ) # The result is in assay slot tse ## End(Not run)preprocess_data(mikropml::otu_small, "dx") # the function can show a progress bar if you have the progressr package installed ## optionally, specify the progress bar format progressr::handlers(progressr::handler_progress( format = ":message :bar :percent | elapsed: :elapsed | eta: :eta", clear = FALSE, show_after = 0 )) ## tell progressor to always report progress ## Not run: progressr::handlers(global = TRUE) ## run the function and watch the live progress udpates dat_preproc <- preprocess_data(mikropml::otu_small, "dx") # Create TreeSE object library(TreeSummarizedExperiment) df <- mikropml::otu_small assay <- df[, !colnames(df) %in% c("dx"), drop = FALSE] |> t() |> as.matrix() tse <- TreeSummarizedExperiment(assays = SimpleList(counts = assay)) colData(tse)[["dx"]] <- df[["dx"]] # Preprocess tse <- preprocess_data( dataset = tse, assay.type = "counts", outcome_colname = "dx" ) # The result is in assay slot tse ## End(Not run)
Randomize feature order to eliminate any position-dependent effects
randomize_feature_order(dataset, outcome_colname)randomize_feature_order(dataset, outcome_colname)
dataset |
Data frame with an outcome variable and other columns as
features. Alternatively, the input can be in |
outcome_colname |
Column name as a string of the outcome variable
(default |
Dataset with feature order randomized.
Nick Lesniak, [email protected]
Kelly Sovacool, [email protected]
dat <- data.frame( outcome = c("1", "2", "3"), a = 4:6, b = 7:9, c = 10:12, d = 13:15 ) randomize_feature_order(dat, "outcome")dat <- data.frame( outcome = c("1", "2", "3"), a = 4:6, b = 7:9, c = 10:12, d = 13:15 ) randomize_feature_order(dat, "outcome")
Removes columns which only have non-zero & non-NA values in threshold row(s) or fewer.
remove_singleton_columns(dat, threshold = 1)remove_singleton_columns(dat, threshold = 1)
dat |
dataframe |
threshold |
Number of rows. If a column only has non-zero & non-NA values
in |
dataframe without singleton columns
Kelly Sovacool, [email protected]
Courtney Armour
remove_singleton_columns(data.frame(a = 1:3, b = c(0, 1, 0), c = 4:6)) remove_singleton_columns(data.frame(a = 1:3, b = c(0, 1, 0), c = 4:6), threshold = 0) remove_singleton_columns(data.frame(a = 1:3, b = c(0, 1, NA), c = 4:6)) remove_singleton_columns(data.frame(a = 1:3, b = c(1, 1, 1), c = 4:6))remove_singleton_columns(data.frame(a = 1:3, b = c(0, 1, 0), c = 4:6)) remove_singleton_columns(data.frame(a = 1:3, b = c(0, 1, 0), c = 4:6), threshold = 0) remove_singleton_columns(data.frame(a = 1:3, b = c(0, 1, NA), c = 4:6)) remove_singleton_columns(data.frame(a = 1:3, b = c(1, 1, 1), c = 4:6))
Replace spaces in all elements of a character vector with underscores
replace_spaces(x, new_char = "_")replace_spaces(x, new_char = "_")
x |
a character vector |
new_char |
the character to replace spaces (default: |
character vector with all spaces replaced with new_char
Kelly Sovacool, [email protected]
dat <- data.frame( dx = c("outcome 1", "outcome 2", "outcome 1"), a = 1:3, b = c(5, 7, 1) ) dat$dx <- replace_spaces(dat$dx) datdat <- data.frame( dx = c("outcome 1", "outcome 2", "outcome 1"), a = 1:3, b = c(5, 7, 1) ) dat$dx <- replace_spaces(dat$dx) dat
This function splits the data set into a train & test set,
trains machine learning (ML) models using k-fold cross-validation,
evaluates the best model on the held-out test set,
and optionally calculates feature importance using the framework
outlined in Topçuoğlu et al. 2020 (doi:10.1128/mBio.00434-20).
Required inputs are a data frame (must contain an outcome variable and all
other columns as features) and the ML method.
See vignette('introduction') for more details.
run_ml(dataset, ...) ## S4 method for signature 'TreeSummarizedExperiment' run_ml( dataset, method, outcome_colname, assay.type = "counts", col.var = NULL, altexp = NULL, ... ) ## S4 method for signature 'ANY' run_ml( dataset, method, outcome_colname = NULL, hyperparameters = NULL, find_feature_importance = FALSE, calculate_performance = TRUE, kfold = 5, cv_times = 100, cross_val = NULL, training_frac = 0.8, perf_metric_function = NULL, perf_metric_name = NULL, groups = NULL, group_partitions = NULL, corr_thresh = 1, seed = NA, ... )run_ml(dataset, ...) ## S4 method for signature 'TreeSummarizedExperiment' run_ml( dataset, method, outcome_colname, assay.type = "counts", col.var = NULL, altexp = NULL, ... ) ## S4 method for signature 'ANY' run_ml( dataset, method, outcome_colname = NULL, hyperparameters = NULL, find_feature_importance = FALSE, calculate_performance = TRUE, kfold = 5, cv_times = 100, cross_val = NULL, training_frac = 0.8, perf_metric_function = NULL, perf_metric_name = NULL, groups = NULL, group_partitions = NULL, corr_thresh = 1, seed = NA, ... )
dataset |
Data frame with an outcome variable and other columns as
features. Alternatively, the input can be in |
... |
All additional arguments are passed on to |
method |
ML method. Options:
|
outcome_colname |
Column name as a string of the outcome variable
(default |
assay.type |
The name of assay from |
col.var |
The name of sample metadata variables from |
altexp |
The name of alternative experiment ( |
hyperparameters |
Dataframe of hyperparameters (default |
find_feature_importance |
Run permutation importance (default: |
calculate_performance |
Whether to calculate performance metrics
(default: |
kfold |
Fold number for k-fold cross-validation (default: |
cv_times |
Number of cross-validation partitions to create (default:
|
cross_val |
a custom cross-validation scheme from
|
training_frac |
Fraction of data for training set (default: |
perf_metric_function |
Function to calculate the performance metric to
be used for cross-validation and test performance. Some functions are
provided by caret (see |
perf_metric_name |
The column name from the output of the function
provided to perf_metric_function that is to be used as the performance
metric. Defaults: binary classification = |
groups |
Vector of groups to keep together when splitting the data into
train and test sets. If the number of groups in the training set is larger
than |
group_partitions |
Specify how to assign |
corr_thresh |
For feature importance, group correlations above or equal
to |
seed |
Random seed (default: |
Named list with results:
trained_model: Output of caret::train(), including the best model.
test_data: Part of the data that was used for testing.
performance: Data frame of performance metrics. The first column is the
cross-validation performance metric, and the last two columns are the ML
method used and the seed (if one was set), respectively.
All other columns are performance metrics calculated on the test data.
This contains only one row, so you can easily combine performance
data frames from multiple calls to run_ml()
(see vignette("parallel")).
feature_importance: If feature importances were calculated, a data frame
where each row is a feature or correlated group. The columns are the
performance metric of the permuted data, the difference between the true
performance metric and the performance metric of the permuted data
(true - permuted), the feature name, the ML method,
the performance metric name, and the seed (if provided).
For AUC and RMSE, the higher perf_metric_diff is, the more important that
feature is for predicting the outcome. For log loss, the lower
perf_metric_diff is, the more important that feature is for
predicting the outcome.
For more details, please see the vignettes.
Begüm Topçuoğlu, [email protected]
Zena Lapp, [email protected]
Kelly Sovacool, [email protected]
## Not run: # regression run_ml(otu_small, "glmnet", seed = 2019 ) # random forest w/ feature importance run_ml(otu_small, "rf", outcome_colname = "dx", find_feature_importance = TRUE ) # custom cross validation & hyperparameters run_ml(otu_mini_bin[, 2:11], "glmnet", outcome_colname = "Otu00001", seed = 2019, hyperparameters = list(lambda = c(1e-04), alpha = 0), cross_val = caret::trainControl(method = "none"), calculate_performance = FALSE ) # Create TreeSE dataset library(TreeSummarizedExperiment) df <- mikropml::otu_small assay <- df[, !colnames(df) %in% c("dx"), drop = FALSE] |> t() |> as.matrix() tse <- TreeSummarizedExperiment(assays = SimpleList(counts = assay)) colData(tse)[["dx"]] <- otu_mini_multi[["dx"]] # Train model res <- run_ml( tse, assay.type = "counts", method = "rf", outcome_colname = "dx" ) ## End(Not run)## Not run: # regression run_ml(otu_small, "glmnet", seed = 2019 ) # random forest w/ feature importance run_ml(otu_small, "rf", outcome_colname = "dx", find_feature_importance = TRUE ) # custom cross validation & hyperparameters run_ml(otu_mini_bin[, 2:11], "glmnet", outcome_colname = "Otu00001", seed = 2019, hyperparameters = list(lambda = c(1e-04), alpha = 0), cross_val = caret::trainControl(method = "none"), calculate_performance = FALSE ) # Create TreeSE dataset library(TreeSummarizedExperiment) df <- mikropml::otu_small assay <- df[, !colnames(df) %in% c("dx"), drop = FALSE] |> t() |> as.matrix() tse <- TreeSummarizedExperiment(assays = SimpleList(counts = assay)) colData(tse)[["dx"]] <- otu_mini_multi[["dx"]] # Train model res <- run_ml( tse, assay.type = "counts", method = "rf", outcome_colname = "dx" ) ## End(Not run)
Used by plot_model_performance().
tidy_perf_data(performance_df)tidy_perf_data(performance_df)
performance_df |
dataframe of performance results from multiple calls to |
Tidy dataframe with model performance metrics.
Begüm Topçuoğlu, [email protected]
Kelly Sovacool, [email protected]
## Not run: # call `run_ml()` multiple times with different seeds results_lst <- lapply(seq(100, 104), function(seed) { run_ml(otu_small, "glmnet", seed = seed) }) # extract and combine the performance results perf_df <- lapply(results_lst, function(result) { result[["performance"]] }) %>% dplyr::bind_rows() # make it pretty! tidy_perf_data(perf_df) ## End(Not run)## Not run: # call `run_ml()` multiple times with different seeds results_lst <- lapply(seq(100, 104), function(seed) { run_ml(otu_small, "glmnet", seed = seed) }) # extract and combine the performance results perf_df <- lapply(results_lst, function(result) { result[["performance"]] }) %>% dplyr::bind_rows() # make it pretty! tidy_perf_data(perf_df) ## End(Not run)
Train model using caret::train().
train_model( train_data, outcome_colname, method, cv, perf_metric_name, tune_grid, ... )train_model( train_data, outcome_colname, method, cv, perf_metric_name, tune_grid, ... )
train_data |
Training data. Expected to be a subset of the full dataset. |
outcome_colname |
Column name as a string of the outcome variable
(default |
method |
ML method. Options:
|
cv |
Cross-validation caret scheme from |
perf_metric_name |
The column name from the output of the function
provided to perf_metric_function that is to be used as the performance
metric. Defaults: binary classification = |
tune_grid |
Tuning grid from |
... |
All additional arguments are passed on to |
Trained model from caret::train().
Zena Lapp, [email protected]
## Not run: training_data <- otu_mini_bin_results_glmnet$trained_model$trainingData %>% dplyr::rename(dx = .outcome) method <- "rf" hyperparameters <- get_hyperparams_list(otu_mini_bin, method) cross_val <- define_cv(training_data, "dx", hyperparameters, perf_metric_function = caret::multiClassSummary, class_probs = TRUE, cv_times = 2 ) tune_grid <- get_tuning_grid(hyperparameters, method) rf_model <- train_model( training_data, "dx", method, cross_val, "AUC", tune_grid, ntree = 1000 ) rf_model$results %>% dplyr::select(mtry, AUC, prAUC) ## End(Not run)## Not run: training_data <- otu_mini_bin_results_glmnet$trained_model$trainingData %>% dplyr::rename(dx = .outcome) method <- "rf" hyperparameters <- get_hyperparams_list(otu_mini_bin, method) cross_val <- define_cv(training_data, "dx", hyperparameters, perf_metric_function = caret::multiClassSummary, class_probs = TRUE, cv_times = 2 ) tune_grid <- get_tuning_grid(hyperparameters, method) rf_model <- train_model( training_data, "dx", method, cross_val, "AUC", tune_grid, ntree = 1000 ) rf_model$results %>% dplyr::select(mtry, AUC, prAUC) ## End(Not run)