Topic Modeling with LDA (Latent Dirichlet Allocation)

Overview

This tutorial is a start for getting familiar with working with text - We follow very closely the excellent set of materials here http://tidytextmining.com/topicmodeling.html#lda-on-chapters

0. Loading libraries & Read in data

library(tm) #to process text
library(topicmodels)
library(dplyr)
library(tidytext)
library(tidyverse)
library(SnowballC) # for stemming
library(stringr)
library(ldatuning)
library(gutenbergr)

1. Import data from the Gutenberg project

titles <- c("Pride and Prejudice")
books <- gutenberg_works(title %in% titles) %>%
  gutenberg_download(meta_fields = "title")

## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

# divide into documents, each representing one chapter
by_chapter <- books %>%
  group_by(title) %>%
  mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
  ungroup() %>%
  filter(chapter > 0) %>%
  unite(document, title, chapter)

# split into words
by_chapter_word <- by_chapter %>%
  unnest_tokens(word, text)

# find document-word counts
word_counts <- by_chapter_word %>%
  anti_join(stop_words) %>%
  count(document, word, sort = TRUE) %>%
  ungroup()

## Joining, by = "word"

word_counts

## # A tibble: 26,281 x 3
##                  document      word     n
##                     <chr>     <chr> <int>
##  1 Pride and Prejudice_43 elizabeth    36
##  2 Pride and Prejudice_18     darcy    33
##  3 Pride and Prejudice_18 elizabeth    25
##  4 Pride and Prejudice_45      miss    22
##  5 Pride and Prejudice_16     darcy    21
##  6 Pride and Prejudice_29      lady    21
##  7 Pride and Prejudice_10     darcy    20
##  8  Pride and Prejudice_8   bingley    20
##  9 Pride and Prejudice_18   bingley    19
## 10 Pride and Prejudice_29 catherine    19
## # ... with 26,271 more rows

# cast the chapter word frequency into a document-term matrix. Here each document is a chapter in the book
chapters_dtm <- word_counts %>%
  cast_dtm(document, word, n)

inspect(chapters_dtm[1:10,1:10])

## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 77/23
## Sparsity           : 23%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## Sample             :
##                         Terms
## Docs                     bennet bingley catherine darcy elizabeth gardiner
##   Pride and Prejudice_10      5      18         0    20        15        0
##   Pride and Prejudice_16      3       4         5    21        18        0
##   Pride and Prejudice_18     10      19         3    33        25        0
##   Pride and Prejudice_29      2       0        19     1        12        0
##   Pride and Prejudice_43      0       1         0    14        36       17
##   Pride and Prejudice_45      2      10         0    14        16        6
##   Pride and Prejudice_47      7       0         0     3        18        6
##   Pride and Prejudice_55     11      13         1     1        17        0
##   Pride and Prejudice_56     18       1        14     6        19        0
##   Pride and Prejudice_8       5      20         0    10        14        0
##                         Terms
## Docs                     jane lady miss wickham
##   Pride and Prejudice_10    2    2   15       0
##   Pride and Prejudice_16    0    9    5      17
##   Pride and Prejudice_18   10   13   10      16
##   Pride and Prejudice_29    0   21    9       1
##   Pride and Prejudice_43    0    4    7       4
##   Pride and Prejudice_45    0    1   22       1
##   Pride and Prejudice_47   15    1    3      11
##   Pride and Prejudice_55   18    0    4       1
##   Pride and Prejudice_56    1   16   15       0
##   Pride and Prejudice_8     5    2   17       0

2. run LDA

Set number of topics as 4.

k <- 4
chapters_lda <- LDA(chapters_dtm, k = k, control = list(seed = 1234))
chapters_lda

## A LDA_VEM topic model with 4 topics.

# tease out the data of the topics
chapter_topics <- tidy(chapters_lda, matrix = "beta")
chapter_topics

## # A tibble: 24,032 x 3
##    topic      term        beta
##    <int>     <chr>       <dbl>
##  1     1 elizabeth 0.014524386
##  2     2 elizabeth 0.015598866
##  3     3 elizabeth 0.018916489
##  4     4 elizabeth 0.014766093
##  5     1     darcy 0.007627316
##  6     2     darcy 0.010601235
##  7     3     darcy 0.016103890
##  8     4     darcy 0.005320462
##  9     1      miss 0.003761612
## 10     2      miss 0.007545411
## # ... with 24,022 more rows

3. Plot top 20 words for each topic

# tease out the data of 20 top terms
top_terms <- chapter_topics %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms

## # A tibble: 80 x 3
##    topic      term        beta
##    <int>     <chr>       <dbl>
##  1     1 elizabeth 0.014524386
##  2     1      jane 0.011826411
##  3     1     darcy 0.007627316
##  4     1   wickham 0.007085701
##  5     1    sister 0.007056486
##  6     1      time 0.006925121
##  7     1    bennet 0.006536751
##  8     1      dear 0.006252631
##  9     1     lydia 0.006082345
## 10     1    letter 0.005896662
## # ... with 70 more rows

# Plot the 20 top terms per topic
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# organize the data for topic proportion per document
doc_gamma <- tidy(chapters_lda, matrix = "gamma")
doc_gamma

## # A tibble: 244 x 3
##                  document topic        gamma
##                     <chr> <int>        <dbl>
##  1 Pride and Prejudice_43     1 1.950564e-05
##  2 Pride and Prejudice_18     1 1.729715e-05
##  3 Pride and Prejudice_45     1 4.812399e-05
##  4 Pride and Prejudice_16     1 2.671750e-05
##  5 Pride and Prejudice_29     1 3.500110e-05
##  6 Pride and Prejudice_10     1 3.986573e-05
##  7  Pride and Prejudice_8     1 4.330501e-05
##  8 Pride and Prejudice_56     1 3.314398e-05
##  9 Pride and Prejudice_47     1 9.999259e-01
## 10 Pride and Prejudice_55     1 9.998826e-01
## # ... with 234 more rows

# give a chapter number to the doc_gamma data frame
doc_gamma.df <- data.frame(doc_gamma)
doc_gamma.df$chapter <- rep(1:dim(chapters_dtm)[1],k)

4. Plot topic proportion along chapter number

ggplot(data = doc_gamma.df, aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic)))+
  geom_line()+
  facet_wrap(~factor(topic), ncol = 1)

5. Find optimal number of topics (k)

Based on the reference from https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html, there are four indices of topic modeling result.

result <- FindTopicsNumber(
chapters_dtm,
topics = seq(from = 2, to = 60, by = 1),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
mc.cores = 2L,
verbose = TRUE)

## fit models... done.
## calculate metrics:
##   Griffiths2004... done.
##   CaoJuan2009... done.
##   Arun2010... done.
##   Deveaud2014... done.

FindTopicsNumber_plot(result)