## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----embedding---------------------------------------------------------------- library(textmineR) # load the data data(movie_review, package = "text2vec") # let's take a sample so the demo will run quickly # note: textmineR is generally quite scaleable, depending on your system set.seed(123) s <- sample(1:nrow(movie_review), 200) movie_review <- movie_review[ s , ] # let's get those nasty "
" symbols out of the way movie_review$review <- stringr::str_replace_all(movie_review$review, "
", "") # First create a TCM using skip grams, we'll use a 5-word window # most options available on CreateDtm are also available for CreateTcm tcm <- CreateTcm(doc_vec = movie_review$review, skipgram_window = 10, verbose = FALSE, cpus = 2) # use LDA to get embeddings into probability space # This will take considerably longer as the TCM matrix has many more rows # than a DTM embeddings <- FitLdaModel(dtm = tcm, k = 50, iterations = 200, burnin = 180, alpha = 0.1, beta = 0.05, optimize_alpha = TRUE, calc_likelihood = FALSE, calc_coherence = FALSE, calc_r2 = FALSE, cpus = 2) ## ----eval = FALSE------------------------------------------------------------- # # parse it into sentences # sent <- stringi::stri_split_boundaries(doc, type = "sentence")[[ 1 ]] # # names(sent) <- seq_along(sent) # so we know index and order # # # embed the sentences in the model # e <- CreateDtm(sent, ngram_window = c(1,1), verbose = FALSE, cpus = 2) # # # remove any documents with 2 or fewer words # e <- e[ rowSums(e) > 2 , ] # # vocab <- intersect(colnames(e), colnames(gamma)) # # e <- e / rowSums(e) # # e <- e[ , vocab ] %*% t(gamma[ , vocab ]) # # e <- as.matrix(e) # ## ----eval = FALSE------------------------------------------------------------- # # get the pairwise distances between each embedded sentence # e_dist <- CalcHellingerDist(e) ## ----eval = FALSE------------------------------------------------------------- # # turn into a similarity matrix # g <- (1 - e_dist) * 100 ## ----eval = FALSE------------------------------------------------------------- # # we don't need sentences connected to themselves # diag(g) <- 0 # # # turn into a nearest-neighbor graph # g <- apply(g, 1, function(x){ # x[ x < sort(x, decreasing = TRUE)[ 3 ] ] <- 0 # x # }) # # # by taking pointwise max, we'll make the matrix symmetric again # g <- pmax(g, t(g)) ## ----eval = FALSE------------------------------------------------------------- # g <- graph.adjacency(g, mode = "undirected", weighted = TRUE) # # # calculate eigenvector centrality # ev <- evcent(g) # # # format the result # result <- sent[ names(ev$vector)[ order(ev$vector, decreasing = TRUE)[ 1:3 ] ] ] # # result <- result[ order(as.numeric(names(result))) ] # # paste(result, collapse = " ") ## ----summaries---------------------------------------------------------------- library(igraph) # let's do this in a function summarizer <- function(doc, gamma) { # recursive fanciness to handle multiple docs at once if (length(doc) > 1 ) # use a try statement to catch any weirdness that may arise return(sapply(doc, function(d) try(summarizer(d, gamma)))) # parse it into sentences sent <- stringi::stri_split_boundaries(doc, type = "sentence")[[ 1 ]] names(sent) <- seq_along(sent) # so we know index and order # embed the sentences in the model e <- CreateDtm(sent, ngram_window = c(1,1), verbose = FALSE, cpus = 2) # remove any documents with 2 or fewer words e <- e[ rowSums(e) > 2 , ] vocab <- intersect(colnames(e), colnames(gamma)) e <- e / rowSums(e) e <- e[ , vocab ] %*% t(gamma[ , vocab ]) e <- as.matrix(e) # get the pairwise distances between each embedded sentence e_dist <- CalcHellingerDist(e) # turn into a similarity matrix g <- (1 - e_dist) * 100 # we don't need sentences connected to themselves diag(g) <- 0 # turn into a nearest-neighbor graph g <- apply(g, 1, function(x){ x[ x < sort(x, decreasing = TRUE)[ 3 ] ] <- 0 x }) # by taking pointwise max, we'll make the matrix symmetric again g <- pmax(g, t(g)) g <- graph.adjacency(g, mode = "undirected", weighted = TRUE) # calculate eigenvector centrality ev <- evcent(g) # format the result result <- sent[ names(ev$vector)[ order(ev$vector, decreasing = TRUE)[ 1:3 ] ] ] result <- result[ order(as.numeric(names(result))) ] paste(result, collapse = " ") } ## ----------------------------------------------------------------------------- # Let's see the summary of the first couple of reviews docs <- movie_review$review[ 1:3 ] names(docs) <- movie_review$id[ 1:3 ] sums <- summarizer(docs, gamma = embeddings$gamma) sums