This tutorial is a start for getting familiar with working with text - We follow very closely the excellent set of materials here http://tidytextmining.com/topicmodeling.html#lda-on-chapters
library(tm) #to process text
library(topicmodels)
library(dplyr)
library(tidytext)
library(tidyverse)
library(SnowballC) # for stemming
library(stringr)
library(ldatuning)
library(gutenbergr)
titles <- c("Pride and Prejudice")
books <- gutenberg_works(title %in% titles) %>%
gutenberg_download(meta_fields = "title")
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# divide into documents, each representing one chapter
by_chapter <- books %>%
group_by(title) %>%
mutate(chapter = cumsum(str_detect(text, regex("^chapter ", ignore_case = TRUE)))) %>%
ungroup() %>%
filter(chapter > 0) %>%
unite(document, title, chapter)
# split into words
by_chapter_word <- by_chapter %>%
unnest_tokens(word, text)
# find document-word counts
word_counts <- by_chapter_word %>%
anti_join(stop_words) %>%
count(document, word, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
word_counts
## # A tibble: 26,281 x 3
## document word n
## <chr> <chr> <int>
## 1 Pride and Prejudice_43 elizabeth 36
## 2 Pride and Prejudice_18 darcy 33
## 3 Pride and Prejudice_18 elizabeth 25
## 4 Pride and Prejudice_45 miss 22
## 5 Pride and Prejudice_16 darcy 21
## 6 Pride and Prejudice_29 lady 21
## 7 Pride and Prejudice_10 darcy 20
## 8 Pride and Prejudice_8 bingley 20
## 9 Pride and Prejudice_18 bingley 19
## 10 Pride and Prejudice_29 catherine 19
## # ... with 26,271 more rows
# cast the chapter word frequency into a document-term matrix. Here each document is a chapter in the book
chapters_dtm <- word_counts %>%
cast_dtm(document, word, n)
inspect(chapters_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 77/23
## Sparsity : 23%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs bennet bingley catherine darcy elizabeth gardiner
## Pride and Prejudice_10 5 18 0 20 15 0
## Pride and Prejudice_16 3 4 5 21 18 0
## Pride and Prejudice_18 10 19 3 33 25 0
## Pride and Prejudice_29 2 0 19 1 12 0
## Pride and Prejudice_43 0 1 0 14 36 17
## Pride and Prejudice_45 2 10 0 14 16 6
## Pride and Prejudice_47 7 0 0 3 18 6
## Pride and Prejudice_55 11 13 1 1 17 0
## Pride and Prejudice_56 18 1 14 6 19 0
## Pride and Prejudice_8 5 20 0 10 14 0
## Terms
## Docs jane lady miss wickham
## Pride and Prejudice_10 2 2 15 0
## Pride and Prejudice_16 0 9 5 17
## Pride and Prejudice_18 10 13 10 16
## Pride and Prejudice_29 0 21 9 1
## Pride and Prejudice_43 0 4 7 4
## Pride and Prejudice_45 0 1 22 1
## Pride and Prejudice_47 15 1 3 11
## Pride and Prejudice_55 18 0 4 1
## Pride and Prejudice_56 1 16 15 0
## Pride and Prejudice_8 5 2 17 0
Set number of topics as 4.
k <- 4
chapters_lda <- LDA(chapters_dtm, k = k, control = list(seed = 1234))
chapters_lda
## A LDA_VEM topic model with 4 topics.
# tease out the data of the topics
chapter_topics <- tidy(chapters_lda, matrix = "beta")
chapter_topics
## # A tibble: 24,032 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 elizabeth 0.014524386
## 2 2 elizabeth 0.015598866
## 3 3 elizabeth 0.018916489
## 4 4 elizabeth 0.014766093
## 5 1 darcy 0.007627316
## 6 2 darcy 0.010601235
## 7 3 darcy 0.016103890
## 8 4 darcy 0.005320462
## 9 1 miss 0.003761612
## 10 2 miss 0.007545411
## # ... with 24,022 more rows
# tease out the data of 20 top terms
top_terms <- chapter_topics %>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms
## # A tibble: 80 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 elizabeth 0.014524386
## 2 1 jane 0.011826411
## 3 1 darcy 0.007627316
## 4 1 wickham 0.007085701
## 5 1 sister 0.007056486
## 6 1 time 0.006925121
## 7 1 bennet 0.006536751
## 8 1 dear 0.006252631
## 9 1 lydia 0.006082345
## 10 1 letter 0.005896662
## # ... with 70 more rows
# Plot the 20 top terms per topic
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
# organize the data for topic proportion per document
doc_gamma <- tidy(chapters_lda, matrix = "gamma")
doc_gamma
## # A tibble: 244 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 Pride and Prejudice_43 1 1.950564e-05
## 2 Pride and Prejudice_18 1 1.729715e-05
## 3 Pride and Prejudice_45 1 4.812399e-05
## 4 Pride and Prejudice_16 1 2.671750e-05
## 5 Pride and Prejudice_29 1 3.500110e-05
## 6 Pride and Prejudice_10 1 3.986573e-05
## 7 Pride and Prejudice_8 1 4.330501e-05
## 8 Pride and Prejudice_56 1 3.314398e-05
## 9 Pride and Prejudice_47 1 9.999259e-01
## 10 Pride and Prejudice_55 1 9.998826e-01
## # ... with 234 more rows
# give a chapter number to the doc_gamma data frame
doc_gamma.df <- data.frame(doc_gamma)
doc_gamma.df$chapter <- rep(1:dim(chapters_dtm)[1],k)
ggplot(data = doc_gamma.df, aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic)))+
geom_line()+
facet_wrap(~factor(topic), ncol = 1)
Based on the reference from https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html, there are four indices of topic modeling result.
result <- FindTopicsNumber(
chapters_dtm,
topics = seq(from = 2, to = 60, by = 1),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
control = list(seed = 77),
mc.cores = 2L,
verbose = TRUE)
## fit models... done.
## calculate metrics:
## Griffiths2004... done.
## CaoJuan2009... done.
## Arun2010... done.
## Deveaud2014... done.
FindTopicsNumber_plot(result)