Healthcare Data Quality Example

Introduction

This vignette demonstrates using autoFlagR for data quality auditing in a healthcare context. We’ll work through a complete example using simulated Electronic Health Records (EHR) data.

Load Required Packages

library(autoFlagR)
library(dplyr)
library(ggplot2)

Create Example Healthcare Dataset

set.seed(123)

# Simulate a cohort of patients. Columns are drawn in a fixed order so the
# seeded random stream produces the same values on every run.
n_patients <- 500
healthcare_data <- data.frame(
  patient_id = seq_len(n_patients),
  age = round(rnorm(n_patients, mean = 55, sd = 15)),
  systolic_bp = round(rnorm(n_patients, mean = 120, sd = 15)),
  diastolic_bp = round(rnorm(n_patients, mean = 80, sd = 10)),
  cholesterol = round(rnorm(n_patients, mean = 200, sd = 40)),
  glucose = round(rnorm(n_patients, mean = 100, sd = 20)),
  bmi = round(rnorm(n_patients, mean = 28, sd = 5), digits = 1),
  gender = sample(c("Male", "Female"), size = n_patients, replace = TRUE),
  diagnosis = sample(
    c("Hypertension", "Diabetes", "Normal"),
    size = n_patients,
    replace = TRUE,
    prob = c(0.3, 0.2, 0.5)
  )
)

# Overwrite the first 25 rows with known anomalies, one column per row range
# Rows 1-10: impossible ages
healthcare_data[1:10, "age"] <- c(250, 180, 200, 190, 185, 175, 170, 165, 160, 155)
# Rows 11-15: extreme blood pressure
healthcare_data[11:15, "systolic_bp"] <- c(300, 280, 290, 275, 285)
# Rows 16-20: very high cholesterol
healthcare_data[16:20, "cholesterol"] <- c(600, 580, 590, 570, 585)
# Rows 21-25: unrealistically low glucose
healthcare_data[21:25, "glucose"] <- c(5, 3, 4, 2, 6)

# Ground-truth label: TRUE exactly for the 25 rows modified above
healthcare_data$is_anomaly_truth <- healthcare_data$patient_id <= 25

head(healthcare_data)
#>   patient_id age systolic_bp diastolic_bp cholesterol glucose  bmi gender
#> 1          1 250         111           70         167      90 24.6   Male
#> 2          2 180         105           70         188     105 30.9 Female
#> 3          3 200         135           80         164      89 24.5 Female
#> 4          4 190         131           79         225     124 25.3 Female
#> 5          5 185          97           55         245     103 31.9 Female
#> 6          6 175         119           90         285      88 25.6 Female
#>      diagnosis is_anomaly_truth
#> 1 Hypertension             TRUE
#> 2     Diabetes             TRUE
#> 3       Normal             TRUE
#> 4       Normal             TRUE
#> 5 Hypertension             TRUE
#> 6       Normal             TRUE

Preprocess Data

# Prepare data for anomaly detection: patient_id is passed as an identifier
# column and scale_method = "mad" selects the scaling method
prepared <- prep_for_anomaly(
  healthcare_data,
  id_cols = "patient_id",
  scale_method = "mad"
)

# View preprocessing metadata.
# The previously recorded run printed NULL for attr(prepared, "metadata"), so
# the metadata is not stored as an attribute. Per the package documentation,
# prep_for_anomaly() returns a list whose `metadata` element holds it.
# NOTE(review): re-knit the vignette to regenerate the printed output.
str(prepared$metadata)

Score Anomalies

# Score every patient record with the Isolation Forest method.
# The ground-truth column is supplied as well; presumably this is what makes
# the benchmark metrics used later in the vignette available.
scored_data <- score_anomaly(
  healthcare_data,
  id_cols = "patient_id",
  ground_truth_col = "is_anomaly_truth",
  method = "iforest",
  contamination = 0.05
)
#> Warning in (function (data, sample_size = min(nrow(data), 10000L), ntrees =
#> 500, : Attempting to use more than 1 thread, but package was compiled without
#> OpenMP support. See
#> https://github.com/david-cortes/installing-optimized-libraries#4-macos-install-and-enable-openmp

# Summarize the distribution of the computed anomaly scores
anomaly_scores <- scored_data$anomaly_score
summary(anomaly_scores)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.0000  0.6605  0.7630  0.7448  0.8583  1.0000

Flag Top Anomalies

# Flag the top anomalies, using the same 5% contamination rate that was
# passed to score_anomaly()
flagged_data <- flag_top_anomalies(scored_data, contamination = 0.05)

# Report how many records were flagged, as a count and as a percentage
n_flagged <- sum(flagged_data$is_anomaly)
cat("Total anomalies flagged:", n_flagged, "\n")
#> Total anomalies flagged: 25
pct_flagged <- 100 * mean(flagged_data$is_anomaly)
cat("Anomaly rate:", pct_flagged, "%\n")
#> Anomaly rate: 5 %

Visualize Results

# Histogram of anomaly scores. The flagging cutoff is read from the
# "anomaly_threshold" attribute of flagged_data and drawn as a dashed red line.
score_cutoff <- attr(flagged_data, "anomaly_threshold")

score_histogram <- ggplot(flagged_data, aes(x = anomaly_score)) +
  geom_histogram(
    bins = 50,
    fill = "steelblue",
    alpha = 0.7,
    color = "black"
  ) +
  geom_vline(
    xintercept = score_cutoff,
    color = "red",
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(title = "Distribution of Anomaly Scores", x = "Anomaly Score", y = "Frequency") +
  theme_minimal()

# Auto-print the finished plot
score_histogram

Extract Top Anomalies

# Retrieve the 10 top-ranked records from the flagged data
top_anomalies <- get_top_anomalies(flagged_data, n = 10)

# Show only the identifier, the key clinical measurements, and the
# anomaly score/flag columns
review_columns <- c(
  "patient_id", "age", "systolic_bp", "cholesterol",
  "glucose", "anomaly_score", "is_anomaly"
)
top_anomalies[, review_columns]
#>    patient_id age systolic_bp cholesterol glucose anomaly_score is_anomaly
#> 1         239  60         119         191     104     1.0000000       TRUE
#> 2         171  52         112         179      99     0.9874545       TRUE
#> 3          51  59         133         199      96     0.9837984       TRUE
#> 4         299  55         123         218      87     0.9836844       TRUE
#> 5          59  57         117         158     104     0.9749733       TRUE
#> 6         267  53         139         207     108     0.9699068       TRUE
#> 7         290  58         110         202     113     0.9602628       TRUE
#> 8         277  58         120         222      86     0.9584102       TRUE
#> 9          48  48         130         225     100     0.9574119       TRUE
#> 10         79  58          97         198      90     0.9572089       TRUE

Benchmarking (if ground truth available)

# Report benchmark metrics, but only when scored_data carries a
# "benchmark_metrics" attribute (presumably set when a ground-truth column
# was supplied to score_anomaly()).
# NOTE(review): in the recorded output the top-ranked patients have ordinary
# measurements and top-k recall is 0 even though AUC-ROC is high; worth
# confirming that the anomaly scores are not inverted.
has_benchmark <- !is.null(attr(scored_data, "benchmark_metrics"))

if (has_benchmark) {
  metrics <- extract_benchmark_metrics(scored_data)
  top_k <- metrics$top_k_recall

  # Threshold-free ranking quality
  cat("AUC-ROC:", metrics$auc_roc, "\n")
  cat("AUC-PR:", metrics$auc_pr, "\n")

  # Recall among the 10 and 50 top-ranked records
  cat("Top-10 Recall:", top_k$top_10, "\n")
  cat("Top-50 Recall:", top_k$top_50, "\n")
}
#> AUC-ROC: 0.9843368 
#> AUC-PR: 0.02557768 
#> Top-10 Recall: 0 
#> Top-50 Recall: 0

Generate Comprehensive Report

# Render the audit report as a PDF. output_dir is set explicitly to tempdir(),
# so the file is written to the session's temporary directory.
generate_audit_report(
  healthcare_data,
  method = "iforest",
  contamination = 0.05,
  id_cols = "patient_id",
  ground_truth_col = "is_anomaly_truth",
  output_format = "pdf",
  output_dir = tempdir(),
  filename = "healthcare_audit_report"
)

The report will include:

- Executive summary with key metrics
- Anomaly score distribution
- Prioritized audit listing (heatmap)
- Bivariate visualizations
- Distribution comparisons
- Benchmarking results (if ground truth provided)

Summary

This example demonstrated:

1. Creating and preprocessing healthcare data
2. Scoring anomalies using Isolation Forest
3. Flagging top anomalies for review
4. Visualizing results
5. Extracting benchmark metrics
6. Generating professional audit reports

For more details, see the Function Reference.