## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  warning = FALSE
)

## -----------------------------------------------------------------------------
library(textmineR)

# load movie_review dataset from text2vec
data(movie_review, package = "text2vec")

str(movie_review)

# let's take a sample so the demo will run quickly
# note: textmineR is generally quite scalable, depending on your system
set.seed(123)
s <- sample(1:nrow(movie_review), 500)

movie_review <- movie_review[ s , ]

# create a document term matrix
dtm <- CreateDtm(doc_vec = movie_review$review, # character vector of documents
                 doc_names = movie_review$id, # document names, optional
                 ngram_window = c(1, 2), # minimum and maximum n-gram length
                 stopword_vec = c(stopwords::stopwords("en"), # English stopwords from the stopwords package
                                  stopwords::stopwords(source = "smart")), # this is the default value
                 lower = TRUE, # lowercase - this is the default value
                 remove_punctuation = TRUE, # punctuation - this is the default
                 remove_numbers = TRUE, # numbers - this is the default
                 verbose = FALSE, # Turn off status bar for this demo
                 cpus = 2) # by default, this will be the max number of cpus available

## -----------------------------------------------------------------------------
dim(dtm)

nrow(dtm)

ncol(dtm)

## -----------------------------------------------------------------------------
head(colnames(dtm))

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(colnames(dtm)), col.names = "colnames(dtm)") # tokens

## ----eval = FALSE-------------------------------------------------------------
# head(rownames(dtm))

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(rownames(dtm)), col.names = "rownames(dtm)") # document IDs

## -----------------------------------------------------------------------------
# get counts of tokens across the corpus
tf_mat <- TermDocFreq(dtm = dtm)

str(tf_mat)

## ----eval = FALSE-------------------------------------------------------------
# # look at the most frequent tokens
# head(tf_mat[ order(tf_mat$term_freq, decreasing = TRUE) , ], 10)

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(tf_mat[ order(tf_mat$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most frequent tokens")

## -----------------------------------------------------------------------------
# look at the most frequent bigrams
tf_bigrams <- tf_mat[ stringr::str_detect(tf_mat$term, "_") , ]

## ----eval = FALSE-------------------------------------------------------------
# head(tf_bigrams[ order(tf_bigrams$term_freq, decreasing = TRUE) , ], 10)

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(tf_bigrams[ order(tf_bigrams$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most frequent bi-grams")
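## -----------------------------------------------------------------------------
# An optional sanity check, not part of the original demo: the idf column
# returned by TermDocFreq() should follow the standard formula
# idf = log(N / doc_freq), where N is the number of documents. Recomputing
# it by hand is an assumption about TermDocFreq()'s internals; if the two
# disagree, trust the package.
idf_check <- log(nrow(dtm) / tf_mat$doc_freq)

# TRUE if the hand-computed values match the packaged ones within tolerance
all.equal(unname(idf_check), unname(tf_mat$idf))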
## -----------------------------------------------------------------------------
# remove offending tokens from the DTM
dtm <- dtm[ , ! stringr::str_detect(colnames(dtm), "(^br$)|(_br$)|(^br_)") ]

# re-construct tf_mat and tf_bigrams
tf_mat <- TermDocFreq(dtm)

tf_bigrams <- tf_mat[ stringr::str_detect(tf_mat$term, "_") , ]

## -----------------------------------------------------------------------------
head(tf_mat[ order(tf_mat$term_freq, decreasing = TRUE) , ], 10)

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(tf_mat[ order(tf_mat$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most frequent tokens, 'br' removed")

## ----eval = FALSE-------------------------------------------------------------
# head(tf_bigrams[ order(tf_bigrams$term_freq, decreasing = TRUE) , ], 10)

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(tf_bigrams[ order(tf_bigrams$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most frequent bi-grams, 'br' removed")

## -----------------------------------------------------------------------------
# summary of document lengths
doc_lengths <- rowSums(dtm)

summary(doc_lengths)

## -----------------------------------------------------------------------------
# remove any tokens that were in 3 or fewer documents
dtm <- dtm[ , colSums(dtm > 0) > 3 ] # alternatively: dtm[ , tf_mat$doc_freq > 3 ]

tf_mat <- tf_mat[ tf_mat$term %in% colnames(dtm) , ]

tf_bigrams <- tf_bigrams[ tf_bigrams$term %in% colnames(dtm) , ]

## -----------------------------------------------------------------------------
# what words are most associated with sentiment?
tf_sentiment <- list(positive = TermDocFreq(dtm[ movie_review$sentiment == 1 , ]),
                     negative = TermDocFreq(dtm[ movie_review$sentiment == 0 , ]))

## ----eval = FALSE-------------------------------------------------------------
# head(tf_sentiment$positive[ order(tf_sentiment$positive$term_freq, decreasing = TRUE) , ], 10)

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(tf_sentiment$positive[ order(tf_sentiment$positive$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most-frequent positive tokens")

## ----eval = FALSE-------------------------------------------------------------
# head(tf_sentiment$negative[ order(tf_sentiment$negative$term_freq, decreasing = TRUE) , ], 10)

## ----echo = FALSE-------------------------------------------------------------
knitr::kable(head(tf_sentiment$negative[ order(tf_sentiment$negative$term_freq, decreasing = TRUE) , ], 10),
             caption = "Ten most-frequent negative tokens")

## -----------------------------------------------------------------------------
# let's re-weight by probability within each class
p_words <- colSums(dtm) / sum(dtm) # alternatively: tf_mat$term_freq / sum(tf_mat$term_freq)

tf_sentiment$positive$conditional_prob <-
  tf_sentiment$positive$term_freq / sum(tf_sentiment$positive$term_freq)

tf_sentiment$positive$prob_lift <- tf_sentiment$positive$conditional_prob - p_words

tf_sentiment$negative$conditional_prob <-
  tf_sentiment$negative$term_freq / sum(tf_sentiment$negative$term_freq)

tf_sentiment$negative$prob_lift <- tf_sentiment$negative$conditional_prob - p_words
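## -----------------------------------------------------------------------------
# To make the re-weighting concrete, here is the same calculation spelled out
# for one token. "great" is only an illustrative choice and may not survive
# the sampling and pruning above, hence the guard; swap in any token from
# colnames(dtm).
token <- "great"

if (token %in% colnames(dtm)) {
  # P(token): overall probability of the token across the whole corpus
  p_overall <- sum(dtm[ , token ]) / sum(dtm)

  # P(token | positive): probability of the token within positive reviews
  p_given_pos <- sum(dtm[ movie_review$sentiment == 1 , token ]) /
    sum(dtm[ movie_review$sentiment == 1 , ])

  # prob_lift is the difference; a positive value means the token is
  # over-represented in positive reviews relative to the corpus
  p_given_pos - p_overall
}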
= "Reweighted: ten most relevant terms for positive sentiment") ## ----eval = FALSE------------------------------------------------------------- # head(tf_sentiment$negative[ order(tf_sentiment$negative$prob_lift, decreasing = TRUE) , ], 10) ## ----echo = FALSE------------------------------------------------------------- knitr::kable(head(tf_sentiment$negative[ order(tf_sentiment$negative$prob_lift, decreasing = TRUE) , ], 10), caption = "Reweighted: ten most relevant terms for negative sentiment") ## ----------------------------------------------------------------------------- # what about bi-grams? tf_sentiment_bigram <- lapply(tf_sentiment, function(x){ x <- x[ stringr::str_detect(x$term, "_") , ] x[ order(x$prob_lift, decreasing = TRUE) , ] }) ## ----eval = FALSE------------------------------------------------------------- # head(tf_sentiment_bigram$positive, 10) ## ----echo = FALSE------------------------------------------------------------- knitr::kable(head(tf_sentiment_bigram$positive, 10), caption = "Reweighted: ten most relevant bigrams for positive sentiment") ## ----eval = FALSE------------------------------------------------------------- # head(tf_sentiment_bigram$negative, 10) ## ----echo = FALSE------------------------------------------------------------- knitr::kable(head(tf_sentiment_bigram$negative, 10), caption = "Reweighted: ten most relevant bigrams for negative sentiment")