---
title: "Advanced Leakage Detection with leakr"
author: "Cheryl Isabella Lim"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Advanced Leakage Detection with leakr}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunk defaults applied to every example in this vignette: figures are
# 7 x 5, output lines are prefixed with "#>", and source and output are
# collapsed into a single block.
knitr::opts_chunk$set(
  fig.height = 5,
  fig.width = 7,
  comment = "#>",
  collapse = TRUE
)
```

```{r setup}
library(leakr)
```

## Introduction

This vignette explores advanced usage patterns for the leakr package, demonstrating how to detect subtle leakage patterns and customise the detection process for specific scenarios. We'll cover complex datasets, configuration options, and best practices for comprehensive leakage detection.

## Understanding leakr's Detection Capabilities

The leakr package can identify various types of data leakage that might compromise model validity. Each type is handled by a detector; the detectors currently registered can be listed as follows:

```{r detector_overview}
# View available detectors
# list_registered_detectors() comes from leakr; its return value is stored
# and printed as-is (presumably one entry per registered detector -- the
# exact shape is not shown in this vignette).
available_detectors <- list_registered_detectors()
print(available_detectors)
```

## Advanced Target Leakage Scenarios

Target leakage can be subtle and context-dependent. Let's explore some realistic scenarios:

### Medical Diagnosis Example

```{r medical_example}
# Create a medical dataset with subtle leakage: one feature is only
# knowable after the diagnosis (the target) has been made.
set.seed(456)
n <- 500

medical_data <- data.frame(
  patient_id = seq_len(n),
  age = sample(25:75, n, replace = TRUE),
  bmi = rnorm(n, mean = 25, sd = 4),
  blood_pressure = rnorm(n, mean = 120, sd = 15),
  diagnosis = factor(
    sample(c("healthy", "diseased"), n, replace = TRUE, prob = c(0.8, 0.2))
  )
)

# Add a leaky feature: treatment_received (only available post-diagnosis).
# Healthy patients never receive treatment; about 90% of diseased patients do.
# One value is drawn per diseased patient and assigned by logical index.
# Passing the short vector to ifelse() would silently recycle it across all
# n rows, reusing some draws and discarding others.
is_diseased <- medical_data$diagnosis == "diseased"
medical_data$treatment_received <- "no"
medical_data$treatment_received[is_diseased] <- sample(
  c("yes", "no"),
  sum(is_diseased),
  replace = TRUE,
  prob = c(0.9, 0.1)
)

# Audit the medical data, naming the target and the identifier column
medical_report <- leakr_audit(
  data = medical_data,
  target = "diagnosis",
  id = "patient_id"
)

print(medical_report)
```

### Financial Data with Temporal Issues

```{r financial_example}
# Create financial data with temporal leakage
set.seed(789)
n_accounts <- 200
dates <- seq(as.Date("2020-01-01"), as.Date("2023-12-31"), by = "month")

financial_data <- data.frame(
  account_id = seq_len(n_accounts),
  transaction_date = sample(dates, n_accounts, replace = TRUE),
  amount = rlnorm(n_accounts, meanlog = 4, sdlog = 1),
  account_balance = rnorm(n_accounts, mean = 1000, sd = 500),
  default_risk = factor(sample(c("low", "high"), n_accounts, replace = TRUE))
)

# Sort by date so the temporal split below follows row order
financial_data <- financial_data[order(financial_data$transaction_date), ]

# Add feature that uses future information (credit score after default
# assessment): high-risk accounts centre on 450, low-risk accounts on 750.
# rnorm() is vectorised over mean and sd, so every row gets its own draw.
# Feeding two shorter rnorm() vectors to ifelse() would recycle them and
# produce exactly duplicated scores.
is_high_risk <- financial_data$default_risk == "high"
financial_data$credit_score_updated <- rnorm(
  nrow(financial_data),
  mean = ifelse(is_high_risk, 450, 750),
  sd = ifelse(is_high_risk, 50, 75)
)

# Create temporal split: transactions before 2022 train, the rest test
financial_data$split <- ifelse(
  financial_data$transaction_date < as.Date("2022-01-01"),
  "train",
  "test"
)

# Audit financial data
financial_report <- leakr_audit(
  data = financial_data,
  target = "default_risk",
  split = "split",
  id = "account_id"
)

print(financial_report)
```

## Advanced Duplication Detection

### Near-Duplicate Detection in Customer Data

```{r customer_duplicates}
# Create customer dataset with near-duplicates
# NOTE(review): nothing in this chunk draws random numbers directly; the seed
# is kept in case the audit itself samples -- confirm and drop if unneeded.
set.seed(321)

# Original customers
customers <- data.frame(
  name = c("John Smith", "Jane Doe", "Bob Johnson", "Alice Brown", "Charlie Davis"),
  email = c("john@email.com", "jane@email.com", "bob@email.com", "alice@email.com", "charlie@email.com"),
  age = c(35, 28, 42, 31, 39),
  income = c(50000, 45000, 75000, 55000, 62000),
  purchase_category = factor(c("electronics", "books", "clothing", "electronics", "books"))
)

# Create near-duplicates of the first three customers with slight variations;
# income and purchase_category are left identical to the originals.
near_dupes <- customers[1:3, ]
near_dupes$name <- c("J Smith", "Jane D", "Robert Johnson")  # Name variations
near_dupes$email <- c("john.smith@email.com", "j.doe@email.com", "bob.johnson@email.com")  # Email variations
near_dupes$age <- near_dupes$age + c(1, 0, -1)  # Age variations

# Combine datasets and give every row (originals and variants) its own id.
# seq_len() stays correct even if the combined frame were empty, unlike
# 1:nrow(), which would yield c(1, 0).
all_customers <- rbind(customers, near_dupes)
all_customers$customer_id <- seq_len(nrow(all_customers))

# Audit for duplicates
dup_report <- leakr_audit(
  data = all_customers,
  target = "purchase_category",
  id = "customer_id"
)

print(dup_report)
```

## Configuration and Customisation

### Custom Configuration Options

```{r configuration}
# Custom configuration for a more sensitive audit. Each entry is documented
# on the line above it.
sensitive_config <- list(
  # Limit sample size for large datasets
  sample_size = 5000,
  # Lower threshold for correlation-based detection
  correlation_threshold = 0.7,
  # Threshold for considering records as duplicates
  duplicate_threshold = 0.9
)

# Run the audit on iris with the custom settings and show the result
iris_sensitive <- leakr_audit(iris, target = "Species", config = sensitive_config)

print(iris_sensitive)
```

## Working with Large Datasets

### Stratified Sampling for Balanced Analysis

```{r large_dataset}
# Simulate a large dataset whose target is heavily imbalanced
set.seed(555)
large_n <- 10000

# Columns are drawn one at a time, in the same order as they appear in the
# data frame, so the seeded random stream is consumed identically.
large_data <- local({
  feature1 <- rnorm(large_n)
  feature2 <- sample(letters[1:10], large_n, replace = TRUE)
  feature3 <- rnorm(large_n, 100, 20)
  # Roughly 5% "rare" versus 95% "common"
  target <- factor(
    sample(c("rare", "common"), large_n, replace = TRUE, prob = c(0.05, 0.95))
  )
  data.frame(feature1, feature2, feature3, target)
})

# Ask leakr for a stratified sample of 1000 on the target, then keep
# only the selected rows
sample_indices <- stratified_sample(large_data$target, 1000)
sampled_data <- large_data[sample_indices, ]

# Class counts before and after sampling, for comparison
table(large_data$target)
table(sampled_data$target)

# Audit the sampled rows only
large_report <- leakr_audit(sampled_data, target = "target")

print(large_report)
```

## Advanced Reporting and Analysis

### Detailed Report Analysis

```{r detailed_analysis}
# Create complex dataset for comprehensive analysis.
# Seeded like the other simulated examples so the printed report does not
# depend on random-number state left over from earlier chunks.
set.seed(654)
n_obs <- 300

complex_data <- data.frame(
  id = seq_len(n_obs),
  # Explicit time zone: without it the timestamps depend on the machine's
  # local zone and the rendered vignette would differ between systems.
  timestamp = seq(
    as.POSIXct("2023-01-01", tz = "UTC"),
    as.POSIXct("2023-12-31", tz = "UTC"),
    length.out = n_obs
  ),
  feature_a = rnorm(n_obs),
  feature_b = sample(LETTERS[1:5], n_obs, replace = TRUE),
  feature_c = rnorm(n_obs, mean = 50, sd = 10),
  outcome = factor(sample(c("success", "failure"), n_obs, replace = TRUE))
)

# Add intentional leakage for demonstration: a 1/0 copy of the target
complex_data$leaky_feature <- as.numeric(complex_data$outcome == "success")

# Generate comprehensive audit
detailed_report <- leakr_audit(
  data = complex_data,
  target = "outcome",
  id = "id"
)

# Generate detailed summary with the configuration included
detailed_summary <- leakr_summarise(detailed_report, top_n = 10, show_config = TRUE)
print(detailed_summary)
```

## Best Practices for Advanced Usage

### 1. Multi-Stage Validation

Implement a systematic approach to leakage detection:

```{r multi_stage}
# Multi-stage validation function: preprocess the data, run the audit,
# summarise it, then report how many findings are marked high severity.
#
# Arguments:
#   data   - the data to audit.
#   target - name of the target column.
#   id     - optional name of the identifier column (default NULL).
#   split  - optional name of the train/test split column (default NULL).
#
# Returns a list with three elements:
#   data    - the result of validate_and_preprocess_data()
#   audit   - the result of leakr_audit()
#   summary - the result of leakr_summarise()
#
# Progress and warnings are written to the console with cat().
comprehensive_validation <- function(data, target, id = NULL, split = NULL) {
  cat("Stage 1: Basic data validation\n")
  # Note the positional order expected here: split comes before id
  clean_data <- validate_and_preprocess_data(data, target, split, id)

  cat("Stage 2: Initial leakage screening\n")
  initial_report <- leakr_audit(clean_data, target = target, split = split, id = id)

  cat("Stage 3: Detailed analysis\n")
  summary_report <- leakr_summarise(initial_report, top_n = 15, show_config = TRUE)

  # Count critical issues.
  # isTRUE() treats a missing, NA or non-scalar severity as "not high", so an
  # NA cannot reach the if() below and abort the function. vapply() guarantees
  # a logical vector even when the issue list is empty or NULL, which makes a
  # separate length check unnecessary.
  # NOTE(review): assumes the audit result exposes an `issues` list whose
  # elements carry a `severity` field -- confirm against leakr_audit().
  is_critical <- vapply(
    initial_report$issues,
    function(issue) isTRUE(issue$severity == "high"),
    logical(1)
  )
  critical_count <- sum(is_critical)

  if (critical_count > 0) {
    cat("WARNING:", critical_count, "critical issues detected!\n")
  }

  list(
    data = clean_data,
    audit = initial_report,
    summary = summary_report
  )
}

# Example usage
# validation_result <- comprehensive_validation(your_data, "target_column")
```

### 2. Domain-Specific Validation

Adapt validation to your specific domain:

```{r domain_specific}
# Example: e-commerce validation that layers a column-name check on top of
# the standard audit.
#
# Arguments:
#   data   - the data to audit; its column names are scanned.
#   target - name of the target column, passed through to leakr_audit().
#
# Returns the leakr_audit() result unchanged. Any domain-specific warnings
# are only written to the console.
ecommerce_validation <- function(data, target) {
  # Standard audit runs first
  base_report <- leakr_audit(data, target = target)

  # Name fragments that suggest a column was recorded after the purchase
  post_purchase_patterns <- c("return", "refund", "satisfaction", "rating")
  columns <- names(data)

  # For each fragment, collect the column names containing it (case-insensitive)
  matches_by_pattern <- lapply(
    post_purchase_patterns,
    function(fragment) grep(fragment, columns, value = TRUE, ignore.case = TRUE)
  )

  # Keep only fragments that matched something; one message per fragment
  matched <- Filter(function(hits) length(hits) > 0, matches_by_pattern)
  messages <- vapply(
    matched,
    function(hits) {
      paste("Potential post-purchase feature:", paste(hits, collapse = ", "))
    },
    character(1)
  )

  if (length(messages) > 0) {
    cat("Domain-specific warnings:\n")
    for (msg in messages) {
      cat("-", msg, "\n")
    }
  }

  base_report
}

# Example e-commerce data.
# Seeded like the other simulated examples so the rendered output does not
# depend on random-number state left over from earlier chunks.
set.seed(987)
n_customers <- 100

ecommerce_data <- data.frame(
  customer_id = seq_len(n_customers),
  purchase_amount = rlnorm(n_customers, meanlog = 4, sdlog = 1),
  product_category = sample(
    c("electronics", "books", "clothing"),
    n_customers,
    replace = TRUE
  ),
  customer_satisfaction = sample(1:5, n_customers, replace = TRUE),  # Post-purchase!
  will_repurchase = factor(sample(c("yes", "no"), n_customers, replace = TRUE))
)

# Validate e-commerce data; the customer_satisfaction column name matches the
# "satisfaction" pattern and should be reported as a potential
# post-purchase feature.
ecommerce_report <- ecommerce_validation(ecommerce_data, "will_repurchase")
```

## Summary

This vignette has demonstrated advanced leakage detection techniques including:

- **Complex leakage scenarios** in medical and financial domains
- **Near-duplicate detection** for customer data
- **Configuration customisation** for different sensitivity requirements
- **Large dataset handling** with stratified sampling
- **Multi-stage validation** workflows
- **Domain-specific validation** approaches

The key to effective leakage detection is understanding your data domain and systematically applying appropriate detection techniques. leakr provides the flexibility to adapt these techniques to your specific requirements whilst maintaining robust detection capabilities.

For integration with popular ML frameworks, see the "Framework Integration" vignette.