## -----------------------------------------------------------------------------
library(heuristica)

## -----------------------------------------------------------------------------
data_set <- city_population_original
criterion_col <- 3               # Population.
cols_to_fit <- 4:ncol(data_set)  # The 9 cues.

reg <- regInterceptModel(data_set, criterion_col, cols_to_fit)
ttb <- ttbModel(data_set, criterion_col, cols_to_fit)
unit <- unitWeightModel(data_set, criterion_col, cols_to_fit)
min <- minModel(data_set, criterion_col, cols_to_fit)

## -----------------------------------------------------------------------------
results <- percentCorrectList(data_set, list(reg, ttb, unit, min))
# Round values to make comparison easier.
round(results)

## -----------------------------------------------------------------------------
# Cross-validate the vector of models, using training_proportion of the data
# to train and the rest as a test set. Outputs the mean percent correct for
# each model in fitting and in prediction.
crossV <- function(vec_of_models, criterion_col, cols_to_fit, data, reps,
                   training_proportion) {
  fitting <- vector()
  prediction <- vector()
  for (i in 1:reps) {
    # Randomly sample training and test row indexes.
    train <- sample(1:nrow(data), nrow(data) * training_proportion)
    test <- setdiff(1:nrow(data), train)
    # Create the training and test sets.
    training_set <- data[train, ]
    test_set <- data[test, ]
    # If a regression is rank-deficient (e.g. it has more columns than rows),
    # it will drop the right-most columns. To instead make it drop random
    # columns, we shuffle the column order.
    shuffled_cols_to_fit <- sample(cols_to_fit)
    models <- list()
    y <- 0
    for (mod in vec_of_models) {
      # Fit each model to the training set.
      y <- y + 1
      models[[y]] <- mod(training_set, criterion_col, shuffled_cols_to_fit)
    }
    # Calculate the percentage of correct predictions in and out of sample.
    fittingAccuracy <- percentCorrectList(training_set, models)
    predictionAccuracy <- percentCorrectList(test_set, models)
    fitting <- rbind(fitting, fittingAccuracy)
    prediction <- rbind(prediction, predictionAccuracy)
  }
  results <- rbind(colMeans(fitting), colMeans(prediction))
  rownames(results) <- c("Fitting", "Prediction")
  results
}

## -----------------------------------------------------------------------------
set.seed(0)  # Use the same seed if you want to reproduce the same results.
reps <- 200  # The book used 10,000, but that takes minutes.
training_proportion <- 0.5
results <- crossV(c(regInterceptModel, ttbModel, unitWeightModel, minModel),
                  criterion_col, cols_to_fit, data_set, reps,
                  training_proportion)
# Round values to make comparison easier.
round(results, 1)

## ----fig.width=7, fig.height=5------------------------------------------------
library(ggplot2)
library(reshape)
p <- melt(results)
colnames(p) <- c("condition", "model", "value")
ggplot(p, aes(x = condition, y = value, colour = model, group = model)) +
  geom_point() +
  geom_line() +
  ylab("Percent correct")
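
## -----------------------------------------------------------------------------
# A minimal sketch, not from the original vignette: if you prefer to avoid the
# reshape dependency, the same long-format data frame can be built in base R,
# because results is a small matrix with row and column names. The name p2 is
# illustrative; it is interchangeable with p in the ggplot call above.
p2 <- data.frame(condition = rep(rownames(results), times = ncol(results)),
                 model = rep(colnames(results), each = nrow(results)),
                 value = as.vector(results))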
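
## -----------------------------------------------------------------------------
# A minimal sketch, not from the original vignette: because crossV exposes
# training_proportion as a parameter, one illustrative follow-up is to give
# the models more training data and probe how the fitting/prediction gap
# changes. The 0.7 split and the name results_70 are assumptions, not values
# from the book.
results_70 <- crossV(c(regInterceptModel, ttbModel, unitWeightModel, minModel),
                     criterion_col, cols_to_fit, data_set, reps, 0.7)
round(results_70, 1)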