## ---- echo=FALSE, message=FALSE------------------------------------------
# Load dependencies
library(plyr)
library(magrittr)
library(ggplot2)
library(clusternomics)

## ------------------------------------------------------------------------
set.seed(1)

# Number of elements in each cluster, follows the table given above
groupCounts <- c(50, 10, 40, 60)
# Centers of clusters
means <- c(-1.5,1.5)
# Helper function to generate test data
testData <- generateTestData_2D(groupCounts, means)
datasets <- testData$data

## ---- fig.width=6--------------------------------------------------------
qplot(datasets[[1]][,1], datasets[[1]][,2], col=factor(testData$groups)) + 
  geom_point(size=3) + 
  ggtitle("Context 1") + xlab("x") + ylab("y") +
  scale_color_discrete(name="Cluster")

## ---- fig.width=6--------------------------------------------------------
qplot(datasets[[2]][,1], datasets[[2]][,2], col=factor(testData$groups)) +
  geom_point(size=3) + 
  ggtitle("Context 2") + xlab("x") + ylab("y") +
  scale_color_discrete(name="Cluster")

## ------------------------------------------------------------------------
# Setup of the algorithm
dataDistributions <- 'diagNormal'
# Pre-specify number of clusters
clusterCounts <- list(global=10, context=c(3,3))
# Set number of iterations
# The following is ONLY FOR SIMULATION PURPOSES 
# Use larger number of iterations for real-life data
maxIter <- 300  
burnin <- 200
lag <- 2  # Thinning of samples

## ----runSampling, message=F----------------------------------------------
# Run context-dependent clustering
results <- contextCluster(datasets, clusterCounts, 
              maxIter = maxIter, burnin = burnin, lag = lag,
              dataDistributions = 'diagNormal',
              verbose = F)

# Extract resulting cluster assignments
samples <- results$samples  

# Extract global cluster assignments for each MCMC sample
clusters <- 
  laply(1:length(samples), function(i) samples[[i]]$Global) 

## ---- fig.width=6--------------------------------------------------------
logliks <- results$logliks

qplot(1:maxIter, logliks) + geom_line() + 
  xlab("MCMC iterations") +
  ylab("Log likelihood")

## ------------------------------------------------------------------------
wrongClusterCounts <- list(global=2, context=c(2,1))
worseResults <- 
  contextCluster(datasets, wrongClusterCounts, 
              maxIter = maxIter, burnin = burnin, lag = lag,
              dataDistributions = 'diagNormal',
              verbose = F)

print(paste('Original model has lower (better) DIC:', results$DIC))
print(paste('Worse model has higher (worse) DIC:', worseResults$DIC))

## ---- fig.width=6--------------------------------------------------------
cc <- numberOfClusters(clusters)
qplot(seq(from=burnin, to = maxIter, by=lag), cc) + 
  geom_line() + xlab("MCMC iterations") + ylab("Number of clusters") 

## ---- fig.width=6--------------------------------------------------------
clusterLabels <- unique(clusters %>% as.vector)
sizes <- matrix(nrow=nrow(clusters), ncol=length(clusterLabels)) 
for (ci in 1:length(clusterLabels)) {
  sizes[,ci] <- rowSums(clusters == clusterLabels[ci])
}
sizes <- sizes %>% as.data.frame
colnames(sizes) <- clusterLabels

boxplot(sizes,xlab="Global combined clusters", ylab="Cluster size")

## ------------------------------------------------------------------------
clusteringResult <- samples[[length(samples)]]

## ---- message=F, fig.width=5, fig.height=5-------------------------------
# Compute the co-clustering matrix from global cluster assignments
coclust <- coclusteringMatrix(clusters)

# Plot the co-clustering matrix as a heatmap
require(gplots)
mypalette <- colorRampPalette(rev(c('#d7191c','#fdae61','#ffffbf','#abd9e9','#4395d2')), 
                              space = "Lab")(100)
h <- heatmap.2(
  coclust, 
  col=mypalette, trace='none',
  dendrogram='row', labRow='', labCol='', key = TRUE,
  keysize = 1.5, density.info=c("none"),
  main="MCMC co-clustering matrix",
  scale = "none")

## ------------------------------------------------------------------------
diag(coclust) <- 1
fit <- hclust(as.dist(1 - coclust))
hardAssignments <- cutree(fit, k=4)

## ---- message=FALSE, fig.width=6-----------------------------------------
aris <- laply(1:nrow(clusters), 
              function(i) mclust::adjustedRandIndex(clusters[i,], testData$groups)) %>%
  as.data.frame
colnames(aris) <- "ARI"
aris$Iteration <- seq(from=burnin, to=maxIter, by=lag)
coclustAri <- mclust::adjustedRandIndex(hardAssignments, testData$groups)
aris$Coclust <- coclustAri
  
ggplot(aris, aes(x=Iteration, y=ARI, colour="MCMC iterations")) +
  geom_point() +
  ylim(0,1) +
  geom_smooth(size=1) + 
  theme_bw() +
  geom_line(aes(x=Iteration, y=Coclust, colour="Co-clustering matrix"), size=1) +
  scale_colour_discrete(name="Cluster assignments")