## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE
)

## -----------------------------------------------------------------------------
library(mvrsquared)

data(mtcars)

# fit a linear model
f <- lm(mpg ~ cyl + disp + hp + wt, data = mtcars)

# extract r-squared
f_summary <- summary(f)

r2_lm <- f_summary$r.squared

r2_lm

# calculate univariate r-squared using mvrsquared
r2_mv <- calc_rsquared(y = mtcars$mpg, yhat = f$fitted.values)

r2_mv

# just to be 100% sure...
r2_lm == r2_mv

## -----------------------------------------------------------------------------
x <- cbind(1, f$model[, -1]) # note, you have to add 1's for the intercept and
                             # I'm removing the first column of f$model as it
                             # is the outcome we are predicting

x <- as.matrix(x) # x needs to be a matrix, not a data.frame or tibble for now

w <- matrix(f$coefficients, ncol = 1) # w also has to be a matrix

# this calculates yhat as the dot product x %*% w
r2_mv2 <- calc_rsquared(y = mtcars$mpg, yhat = list(x = x, w = w))

r2_mv2

## -----------------------------------------------------------------------------
r2_mv2 == r2_lm

## -----------------------------------------------------------------------------
round(r2_mv2, 14) == round(r2_lm, 14)

## -----------------------------------------------------------------------------
calc_rsquared(y = cbind(mtcars$mpg, mtcars$mpg),
              yhat = cbind(f$fitted.values, f$fitted.values))

## -----------------------------------------------------------------------------
library(nnet)

# let's generate some synthetic data
set.seed(666)

# some continuous variables
x1 <- rnorm(n = 10000, mean = 1, sd = 2)
x2 <- rnorm(n = 10000, mean = 1.5, sd = 2.5)
x3 <- rnorm(n = 10000, mean = 0.6, sd = 1.5)

# linear combinations used to generate outcomes with logit functions
z1 <- 1 - 1.2 * x1 + 4.5 * x2 - 2.8 * x3
z2 <- -2 + 2.8 * x1 - 1.2 * x2 + 4.5 * x3

y1 <- rbinom(10000, 1, prob = 1 / (1 + exp(-z1)))
y2 <- rbinom(10000, 1, prob = 1 / (1 + exp(-z2)))

# fit a multinomial model using a 1-layer neural net with 10 nodes
f_mv <- nnet(cbind(y1, y2) ~ x1 + x2 + x3, size = 10)

yhat <- predict(f_mv, data.frame(x1 = x1, x2 = x2, x3 = x3), type = "raw")

# and now calculate r-squared
calc_rsquared(y = cbind(y1, y2), yhat = yhat)
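# To see what calc_rsquared is doing under the hood: R-squared generalizes to
# multiple outcomes as 1 - SSE / SST, where SSE and SST are sums of squared
# Euclidean distances of each row of y from its prediction and from the mean
# vector ybar, respectively. Below is a minimal manual sketch of that
# calculation (the *_manual objects are illustrative, not part of the package's
# API); it should agree with the value returned by calc_rsquared() above.
y_mat <- cbind(y1, y2)

sse_manual <- sum((y_mat - yhat)^2)                   # squared distances to predictions

sst_manual <- sum(sweep(y_mat, 2, colMeans(y_mat))^2) # squared distances to ybar

1 - sse_manual / sst_manual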
## -----------------------------------------------------------------------------
library(tidytext)
library(textmineR)
library(dplyr)
library(stringr)

# load documents in a data frame
docs <- nih_sample

# tokenize using tidytext's unnest_tokens
tidy_docs <- docs %>%
  select(APPLICATION_ID, ABSTRACT_TEXT) %>%
  unnest_tokens(output = word,
                input = ABSTRACT_TEXT,
                stopwords = stop_words$word,
                token = "ngrams",
                n_min = 1, n = 2) %>%
  count(APPLICATION_ID, word) %>%
  filter(n > 1) # filtering for words/bigrams per document, rather than per corpus

# filter words that are just numbers
tidy_docs <- tidy_docs %>%
  filter(!str_detect(word, "^[0-9]+$"))

# turn a tidy tbl into a sparse dgCMatrix for use in textmineR
dtm <- tidy_docs %>%
  cast_sparse(APPLICATION_ID, word, n)

# create a topic model
lda <- FitLdaModel(dtm = dtm,
                   k = 20,
                   iterations = 200,
                   burnin = 175)

## -----------------------------------------------------------------------------
r2_lda <- calc_rsquared(y = dtm,
                        yhat = list(x = rowSums(dtm) * lda$theta, w = lda$phi))

r2_lda

## -----------------------------------------------------------------------------
lsa <- FitLsaModel(dtm = dtm, k = 20)

r2_lsa <- calc_rsquared(y = dtm,
                        yhat = list(x = lsa$theta %*% diag(lsa$sv), w = lsa$phi))

r2_lsa

## ----eval = FALSE--------------------------------------------------------------
# library(parallel)
#
# batch_size <- 10
#
# batches <- mclapply(X = seq(1, nrow(dtm), by = batch_size),
#                     FUN = function(b){
#
#                       # rows to select on
#                       rows <- b:min(b + batch_size - 1, nrow(dtm))
#
#                       # rows of the dtm
#                       y_batch <- dtm[rows, ]
#
#                       # rows of theta multiplied by document length
#                       x_batch <- rowSums(y_batch) * lda$theta[rows, ]
#
#                       list(y = y_batch,
#                            x = x_batch)
#                     }, mc.cores = 2)
#
#
# # calculate ybar for the data
# # in this case, lazily doing colMeans, but you could divide this problem up too
# ybar <- colMeans(dtm)
#
# # MAP: calculate sums of squares
# ss <- mclapply(X = batches,
#                FUN = function(batch){
#                  calc_rsquared(y = batch$y,
#                                yhat = list(x = batch$x, w = lda$phi),
#                                ybar = ybar,
#                                return_ss_only = TRUE)
#                }, mc.cores = 2)
#
#
# # REDUCE: get SST and SSE by summation
# ss <- do.call(rbind, ss) %>% colSums()
#
# r2_mapreduce <- 1 - ss["sse"] / ss["sst"]
#
# # should be the same as above
# r2_mapreduce
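#
# # a quick sketch of a sanity check (illustrative, and only meaningful once the
# # un-evaluated code above has been run): the map-reduce value should match
# # r2_lda computed earlier, up to floating point error
# all.equal(as.numeric(r2_mapreduce), as.numeric(r2_lda))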