The leakr package integrates seamlessly with popular machine learning frameworks in R, allowing you to incorporate leakage detection directly into existing modelling workflows. This vignette demonstrates how to use leakr with caret, mlr3, and tidymodels frameworks.
Integration functions automatically extract relevant information from trained models and preprocessing pipelines, making it easy to audit your complete machine learning workflow for potential data leakage issues.
The caret package is widely used for classification and regression training. leakr can analyse caret training objects to detect leakage in the underlying data and preprocessing steps.
# Load required libraries
library(caret)

# Prepare iris data for caret
data(iris)
set.seed(123)

# Create a stratified 80/20 train/test split using caret
train_index <- createDataPartition(iris$Species, p = 0.8, list = FALSE)
train_data <- iris[train_index, ]
test_data <- iris[-train_index, ]

# Train a random forest with 5-fold cross-validation on the training set
model_fit <- train(
Species ~ .,
data = train_data,
method = "rf",
trControl = trainControl(method = "cv", number = 5)
)

# Use leakr to audit the caret model: the fitted object plus the full
# original dataset let leakr compare the training subset against the
# complete data for potential leakage signals
caret_audit <- leakr_from_caret(
train_obj = model_fit,
original_data = iris,
target_name = "Species"
)
print(caret_audit)

# Example with preprocessing steps that might introduce leakage
set.seed(456)

# Create a more complex dataset: two numeric features, one categorical
# feature, and a binary factor target (200 rows)
complex_data <- data.frame(
feature1 = rnorm(200),
feature2 = rnorm(200, 50, 10),
feature3 = sample(c("A", "B", "C"), 200, replace = TRUE),
target = factor(sample(c("positive", "negative"), 200, replace = TRUE))
)

# Add missing values to demonstrate preprocessing (imputation)
complex_data$feature1[sample(1:200, 20)] <- NA
complex_data$feature2[sample(1:200, 15)] <- NA

# Create stratified 70/30 train/test split
train_idx <- createDataPartition(complex_data$target, p = 0.7, list = FALSE)
train_complex <- complex_data[train_idx, ]
test_complex <- complex_data[-train_idx, ]

# Define preprocessing with potential leakage risks.
# NOTE(review): this preProcess object is illustrative only — it is not
# passed to train() below, which declares its own preProcess methods.
preprocess_recipe <- preProcess(
train_complex[, -4], # Exclude target
method = c("center", "scale", "medianImpute")
)

# Train a GLM; caret applies centring/scaling/median imputation itself
model_complex <- train(
target ~ .,
data = train_complex,
method = "glm",
preProcess = c("center", "scale", "medianImpute"),
trControl = trainControl(method = "cv", number = 3)
)

# Audit the complete workflow against the full original dataset
complex_audit <- leakr_from_caret(
train_obj = model_complex,
original_data = complex_data,
target_name = "target"
)

# Generate detailed summary (including the audit configuration)
caret_summary <- leakr_summarise(complex_audit, show_config = TRUE)
print(caret_summary)
print(caret_summary)

The mlr3 ecosystem provides a modern, object-oriented approach to machine learning in R. leakr can extract information from mlr3 tasks and learners.
# Load mlr3 components
library(mlr3)
library(mlr3learners)

# Create an mlr3 classification task wrapping the iris data
iris_task <- TaskClassif$new(
id = "iris_classification",
backend = iris,
target = "Species"
)

# Use leakr to audit the mlr3 task
# (include_target = TRUE presumably audits the target column as well —
# confirm against the leakr_from_mlr3() documentation)
mlr3_audit <- leakr_from_mlr3(
task = iris_task,
include_target = TRUE
)
print(mlr3_audit)
print(mlr3_audit)

library(mlr3pipelines)
# Create a more complex dataset for demonstration: a Titanic-like table
# with deliberately missing age/fare values in rows 101-120
titanic_like <- data.frame(
age = c(rnorm(100, 35, 10), rep(NA, 20)),
fare = c(rnorm(100, 50, 20), rep(NA, 20)),
sex = sample(c("male", "female"), 120, replace = TRUE),
class = sample(c("1st", "2nd", "3rd"), 120, replace = TRUE),
survived = factor(sample(c("yes", "no"), 120, replace = TRUE)),
stringsAsFactors = TRUE
)

# Create task
survival_task <- TaskClassif$new(
id = "survival_prediction",
backend = titanic_like,
target = "survived"
)

# Create preprocessing pipeline that might introduce leakage
preprocessing_pipeline <- po("imputehist") %>>% # Imputation
po("scale") %>>% # Scaling
po("encode") # Factor encoding

# Create full pipeline with learner
full_pipeline <- preprocessing_pipeline %>>%
po("learner", lrn("classif.rpart"))

# Convert to learner
pipeline_learner <- as_learner(full_pipeline)

# Audit the mlr3 pipeline.
# NOTE(review): only the task is passed here — pipeline_learner is built
# above but never handed to leakr_from_mlr3(), so this audit covers the
# task data rather than the pipeline itself. Verify intended API usage.
pipeline_audit <- leakr_from_mlr3(
task = survival_task,
include_target = TRUE
)

# Detailed analysis of the top 8 findings
mlr3_summary <- leakr_summarise(pipeline_audit, top_n = 8)
print(mlr3_summary)
print(mlr3_summary)

The tidymodels framework provides a consistent approach to modelling in R. leakr can analyse workflows and recipes for potential leakage issues.
# Load tidymodels components
library(tidymodels)

# Create initial stratified 80/20 split
data(iris)
set.seed(789)
iris_split <- initial_split(iris, prop = 0.8, strata = Species)

# Create recipe: normalise numeric predictors, dummy-encode nominals
iris_recipe <- recipe(Species ~ ., data = training(iris_split)) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())

# Create model specification (random forest via the ranger engine)
iris_model <- rand_forest(mode = "classification") %>%
set_engine("ranger")

# Create workflow bundling the recipe and the model
iris_workflow <- workflow() %>%
add_recipe(iris_recipe) %>%
add_model(iris_model)

# Use leakr to audit the tidymodels workflow against the full dataset
tidymodels_audit <- leakr_from_tidymodels(
workflow = iris_workflow,
data = iris
)
print(tidymodels_audit)
print(tidymodels_audit)

# Create a dataset with potential feature engineering leakage
set.seed(987)

# Synthetic customer table: purchase behaviour plus a binary
# will_return target (30% "yes")
engineering_data <- data.frame(
customer_id = 1:300,
purchase_amount = rlnorm(300, 3, 1),
days_since_last = rpois(300, 30),
category = sample(c("electronics", "clothing", "books"), 300, replace = TRUE),
month = sample(1:12, 300, replace = TRUE),
will_return = factor(sample(c("yes", "no"), 300, replace = TRUE, prob = c(0.3, 0.7)))
)

# Add potential leakage: customer_lifetime_value is derived from the
# target (will_return), i.e. future information a model must not see
engineering_data$customer_lifetime_value <-
ifelse(engineering_data$will_return == "yes",
engineering_data$purchase_amount * runif(300, 2, 5),
engineering_data$purchase_amount * runif(300, 0.5, 1.5))

# Create stratified 80/20 data split
engineering_split <- initial_split(engineering_data, prop = 0.8, strata = will_return)

# Create comprehensive recipe with potential leakage sources: the
# log/normalise/PCA steps all consume customer_lifetime_value, so the
# leaked signal propagates into the engineered features
engineering_recipe <- recipe(will_return ~ ., data = training(engineering_split)) %>%
update_role(customer_id, new_role = "ID") %>%
step_log(purchase_amount, customer_lifetime_value) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_interact(terms = ~ purchase_amount:days_since_last) %>%
step_pca(all_numeric_predictors(), num_comp = 5)

# Create model specification (logistic regression)
engineering_model <- logistic_reg() %>%
set_engine("glm")

# Create workflow
engineering_workflow <- workflow() %>%
add_recipe(engineering_recipe) %>%
add_model(engineering_model)

# Audit the complex tidymodels workflow against the full dataset
complex_tidymodels_audit <- leakr_from_tidymodels(
workflow = engineering_workflow,
data = engineering_data
)

# Generate detailed summary of the top 10 findings
tidymodels_summary <- leakr_summarise(
complex_tidymodels_audit,
top_n = 10,
show_config = TRUE
)
print(tidymodels_summary)
print(tidymodels_summary)

leakr provides seamless integration with data import/export operations, allowing you to audit data as part of your data pipeline:
# Import data with automatic leakage checking
# This would typically be used with real files
example_data <- data.frame(
id = 1:100,
feature1 = rnorm(100),
feature2 = sample(letters[1:5], 100, replace = TRUE),
target = factor(sample(c("A", "B"), 100, replace = TRUE))
)

# Simulate importing and auditing in one step; the id column is named
# so leakr can treat it as a row identifier rather than a predictor
imported_audit <- leakr_audit(
data = example_data,
target = "target",
id = "id"
)
# Quick import function (simulated)
# Placeholder helper showing how import + audit would be combined into
# a single call.
# NOTE(review): the arguments are currently ignored — it returns the
# `imported_audit` object computed above, purely for demonstration.
leakr_quick_audit <- function(data_path, target, ...) {
# In practice, this would use leakr_import() followed by leakr_audit()
# data <- leakr_import(data_path, ...)
# audit <- leakr_audit(data, target = target)
# return(list(data = data, audit = audit))
# For demonstration
return(imported_audit)
}

# Export data along with audit reports
# Options controlling what an export would include alongside the data
export_config <- list(
include_audit_report = TRUE,
format = "comprehensive",
generate_summary = TRUE
)

# This would export both data and audit results
# leakr_export_data(
# data = example_data,
# file_path = "audited_dataset",
# audit_report = imported_audit,
# config = export_config
# )

leakr supports data versioning and snapshot functionality for tracking changes in leakage patterns over time:
# Create snapshot of current data state, tagged with provenance metadata
snapshot_info <- leakr_create_snapshot(
data = example_data,
name = "baseline_data",
metadata = list(
created_by = "data_scientist",
purpose = "baseline_analysis",
version = "1.0"
)
)

# List available snapshots
available_snapshots <- leakr_list_snapshots()
print(available_snapshots)

# Load previous snapshot for comparison
# previous_data <- leakr_load_snapshot("baseline_data")

# Complete pre-training validation workflow
# Validates and preprocesses `data`, creates a reproducible train/test
# split, runs a full leakage audit, and flags whether training is safe.
#
# @param data       A data.frame to validate and audit.
# @param target     Name of the target column (string).
# @param test_split Proportion of rows held out for testing (default 0.2).
# @return A list with the validated data, the "train"/"test" split
#   vector, the audit report, and a `safe_to_train` flag (TRUE when no
#   high-severity issues were found).
validate_before_training <- function(data, target, test_split = 0.2) {
  # Step 1: Basic data validation
  validated_data <- validate_and_preprocess_data(
    data = data,
    target = target,
    split = NULL,
    id = NULL,
    config = list(remove_empty_cols = TRUE)
  )

  # Step 2: Create train/test split (seeded for reproducibility).
  # floor() makes sample()'s implicit size truncation explicit, and
  # sample.int() avoids the 1:n pitfall (1:0 == c(1, 0)) when n == 0.
  set.seed(42)
  n <- nrow(validated_data)
  train_indices <- sample.int(n, floor((1 - test_split) * n))
  split_vector <- rep("test", n)
  split_vector[train_indices] <- "train"

  # Step 3: Comprehensive leakage audit
  audit_report <- leakr_audit(
    data = validated_data,
    target = target,
    split = split_vector
  )

  # Step 4: Check for blocking issues.
  # sum(vapply(...)) is robust when `issues` is empty, unlike the
  # length(x[sapply(x, ...)]) idiom, which errors on an empty list.
  critical_issues <- sum(vapply(
    audit_report$issues,
    function(x) identical(x$severity, "high"),
    logical(1)
  ))
  if (critical_issues > 0) {
    warning(paste("Found", critical_issues, "critical issues. Review before training."))
  }

  list(
    data = validated_data,
    split = split_vector,
    audit = audit_report,
    safe_to_train = critical_issues == 0
  )
}
# Example usage
# validation_result <- validate_before_training(your_data, "target_column")
# if (validation_result$safe_to_train) {
# # Proceed with model training
# }

# Post-training comprehensive audit
# Runs the framework-appropriate leakr extractor on a fitted model
# object and pairs the raw audit with a detailed summary.
#
# @param model_object A fitted model from one of the supported frameworks.
# @param framework    One of "caret", "mlr3", or "tidymodels".
# @return A list with elements `audit` (raw result) and `summary`.
post_training_audit <- function(model_object, framework = "caret") {
  # Pick the matching integration helper; unknown frameworks abort here
  # (the unnamed final argument is switch()'s default case).
  extract_audit <- switch(framework,
    "caret" = leakr_from_caret,
    "mlr3" = leakr_from_mlr3,
    "tidymodels" = leakr_from_tidymodels,
    stop("Unsupported framework")
  )
  audit_outcome <- extract_audit(model_object)

  # Produce an in-depth summary alongside the raw audit
  report <- leakr_summarise(
    audit_outcome,
    top_n = 15,
    show_config = TRUE
  )

  list(audit = audit_outcome, summary = report)
}

# Set up continuous monitoring for production data
# Builds a monitoring closure that, each time it is called, fetches the
# current data from `data_source`, audits it for leakage, and emits an
# alert message when medium/high severity issues are found.
#
# @param data_source A zero-argument function returning the current data.
# @param target      Name of the target column to audit against.
# @param schedule    Intended run cadence (informational only; an
#   external scheduler such as cron is expected to invoke the result).
# @return A function that runs one audit cycle and returns the audit.
setup_leakage_monitoring <- function(data_source, target, schedule = "daily") {
  monitor_config <- list(
    alert_threshold = "medium", # Alert on medium+ severity issues
    notification_email = "data-team@company.com",
    generate_plots = TRUE,
    archive_reports = TRUE
  )

  # This would typically integrate with a scheduler like cron
  monitoring_function <- function() {
    # Load current data
    current_data <- data_source() # Function to fetch current data

    # Run audit
    current_audit <- leakr_audit(
      data = current_data,
      target = target,
      config = monitor_config
    )

    # Count issues requiring attention; sum(vapply(...)) handles an
    # empty issue list safely (length(x[sapply(x, ...)]) errors there).
    medium_high_issues <- sum(vapply(
      current_audit$issues,
      function(x) x$severity %in% c("medium", "high"),
      logical(1)
    ))
    if (medium_high_issues > 0) {
      # Send alert (implementation would depend on your notification system)
      message(paste("Leakage monitoring alert:", medium_high_issues, "issues detected"))
    }

    # Archive report with timestamp
    timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
    # save(current_audit, file = paste0("audit_", timestamp, ".RData"))

    return(current_audit)
  }

  return(monitoring_function)
}
# Example setup
# monitor <- setup_leakage_monitoring(
# data_source = function() { read.csv("daily_data.csv") },
# target = "outcome"
# )
# daily_audit <- monitor()

# Configuration for large-scale processing
# Default settings consumed by process_large_dataset() below
large_scale_config <- list(
sample_size = 10000, # Limit memory usage
chunk_processing = TRUE, # Process in chunks
parallel_detectors = FALSE, # Disable if memory constrained
save_intermediate = TRUE, # Save intermediate results
cleanup_temp = TRUE # Clean up temporary objects
)
# Process very large dataset efficiently
#
# Audits a CSV dataset on disk, switching to (placeholder) chunked
# processing when the file exceeds 1 GB.
#
# @param data_path Path to a CSV file on disk.
# @param target    Name of the target column.
# @param config    Audit configuration (defaults to `large_scale_config`).
# @return A leakr audit report for manageable files; NULL (invisibly)
#   for the chunked branch, whose implementation is still a sketch.
process_large_dataset <- function(data_path, target, config = large_scale_config) {
  size_bytes <- file.size(data_path)
  # file.size() returns NA for missing paths, which would make the
  # size comparison crash with an unhelpful error — fail fast instead.
  if (is.na(size_bytes)) {
    stop("File not found: ", data_path)
  }
  if (size_bytes > 1e9) { # > 1GB
    # Implement chunked processing
    message("Large dataset detected, using chunked processing")
    # This would implement actual chunked reading and processing
    # chunk_results <- process_in_chunks(data_path, target, config)
    # combined_audit <- combine_audit_results(chunk_results)
    # return(combined_audit)
    invisible(NULL) # explicit placeholder result until chunking exists
  } else {
    # Standard processing for manageable datasets
    data <- read.csv(data_path)
    return(leakr_audit(data, target = target, config = config))
  }
}

# Create environment-specific configurations
# Development: small samples, rich diagnostics
development_config <- list(
sample_size = 1000,
generate_plots = TRUE,
detailed_logging = TRUE
)

# Production: larger samples, minimal output, alerting enabled
production_config <- list(
sample_size = 10000,
generate_plots = FALSE,
detailed_logging = FALSE,
alert_on_issues = TRUE
)

# Testing: tiny samples but exhaustive, strict detection
testing_config <- list(
sample_size = 500,
run_all_detectors = TRUE,
strict_thresholds = TRUE
)

# Robust error handling for production environments
# Wraps leakr_audit() so failures never propagate: on success the audit
# is returned in a result record, on error the condition message is
# surfaced via warning() and captured in the record instead.
safe_audit <- function(data, target, ...) {
  on_failure <- function(e) {
    # Log error details, then hand back a failure record
    error_msg <- paste("Audit failed:", conditionMessage(e))
    warning(error_msg)
    list(
      success = FALSE,
      error = error_msg,
      timestamp = Sys.time()
    )
  }

  tryCatch(
    {
      audit_result <- leakr_audit(data, target = target, ...)
      # Log successful audit
      message(paste("Audit completed successfully at", Sys.time()))
      list(
        success = TRUE,
        audit = audit_result,
        timestamp = Sys.time()
      )
    },
    error = on_failure
  )
}

This vignette has demonstrated comprehensive integration patterns for using leakr with major R machine learning frameworks:
Key integration principles:
By following these patterns, you can ensure that leakage detection becomes a natural and reliable part of your machine learning workflows, helping maintain model integrity and reproducibility across different frameworks and environments.