
Commit c645835

Jonathan Chang committed

First commit of the Topic Models Julia package

1 parent 94357fb commit c645835

File tree: README.md · REQUIRE · src/TopicModels.jl

3 files changed, +223 -0 lines changed

README.md

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# Topic Models for Julia

Topic models are Bayesian, hierarchical mixture models of discrete data.
This package implements utilities for reading and manipulating data commonly
associated with topic models, as well as inference and prediction procedures
for such models.

## Model description

The bulk of the package is designed for a particular topic model, Latent
Dirichlet Allocation (LDA; Blei et al., 2003). This model assumes a corpus
composed of a collection of bags of words; each bag of words is termed a
document. The space whence the words are drawn is termed the lexicon.
Formally, the model is defined as

    For each topic k,
        phi_k ~ Dirichlet(beta)
    For each document d,
        theta ~ Dirichlet(alpha)
        For each word w,
            z ~ Multinomial(theta)
            w ~ Multinomial(phi_z)
alpha and beta are hyperparameters of the model. The number of topics, K,
is a fixed parameter of the model, and w is observed. This package fits
the topics using collapsed Gibbs sampling (Griffiths and Steyvers, 2004).
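For reference, each Gibbs sweep resamples every token's topic assignment
z_i from its conditional distribution given all other assignments. This
identity is standard for collapsed LDA; it is not spelled out in the
original text, but it is the quantity computed by wordDistribution in
src/TopicModels.jl:

    p(z_i = k \mid z_{-i}, w) \propto (n_{d,k} + \alpha_k) \cdot \frac{n_{k,w_i} + \beta}{n_k + V\beta}

where the counts n_{d,k} (tokens of topic k in document d), n_{k,w_i}
(tokens of word w_i assigned to topic k), and n_k (all tokens assigned to
topic k) exclude token i, and V is the lexicon size.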
## Package usage

We describe the functions of the package using an example. First we load
corpora from data files as follows:

    testDocuments = readDocuments(open("cora.documents"))
    testLexicon = readLexicon(open("cora.lexicon"))
These read files in LDA-C format. The lexicon file is assumed to have one
word per line. The document file consists of one document per line. Each
document consists of a collection of tuples; the first element of each tuple
identifies the word while the second element gives the number of times that
word appears in the document. Words are indicated by an index into the
lexicon file, starting at zero. The tuples are separated by spaces, and the
entire line is prefixed by a number indicating the number of tuples for
that document.
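As an illustration (these numbers are hypothetical, not taken from the cora
files), a document line might read

    3 0:2 4:1 7:3

which describes a document with three distinct words: two occurrences of
lexicon word 0, one occurrence of word 4, and three occurrences of word 7.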
With the documents loaded, we instantiate a model that we want to train:

    model = Model(fill(0.1, 10), 0.01, length(testLexicon), testDocuments)
This is a model with 10 topics. alpha is set to a uniform Dirichlet prior
with weight 0.1 on each topic (the length of this vector determines the
number of topics). The second parameter sets the prior weight on phi
(i.e., beta) to 0.01. The third parameter is the lexicon size; here we use
the length of the lexicon we have just read. The fourth parameter is the
collection of documents.
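Other configurations follow the same pattern. For instance (hypothetical
settings, purely for illustration), a 25-topic model with a sparser
document-topic prior could be constructed as:

    model25 = Model(fill(0.05, 25), 0.01, length(testLexicon), testDocuments)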
With the model defined, we can train it on the corpus:

    trainModel(testDocuments, model, 30)

The trainModel function takes the corpus as the first argument, the model
as the second argument, and the number of iterations of collapsed Gibbs
sampling to perform as the third argument. The contents of the model
are mutated in place.
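Because the model is mutated in place, calling trainModel again resumes
sampling from the current state rather than starting over. For example:

    trainModel(testDocuments, model, 20)  # run 20 further iterations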
Finally, we can examine the output of the trained model using topTopicWords:

    topWords = topTopicWords(model, testLexicon, 10)

This function retrieves the top words associated with each topic, which
serves as a useful summary of the model. The first parameter is the model,
the second is the lexicon backing the corpus, and the third is the number
of words to retrieve for each topic. The output is an array of arrays of
words, sorted in decreasing order of prevalence within each topic.
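Beyond the top words, the posterior mean of each topic's word distribution
can be recovered from the model's sufficient statistics. This is a minimal
sketch, not part of the package API, using the same smoothing as
wordDistribution:

    K, V = size(model.topics)
    phiHat = zeros(Float64, K, V)
    for k in 1:K
        # Smoothed empirical distribution over the lexicon for topic k.
        phiHat[k, 1:end] = (model.topics[k, 1:end] .+ model.betaPrior) /
                           (model.topicSums[k] + V * model.betaPrior)
    end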
## See also

The R package from which much of this code was derived is available at
https://github.com/slycoder/R-lda-deprecated.

REQUIRE

Whitespace-only changes.

src/TopicModels.jl

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
module TopicModels

typealias RaggedMatrix{T} Array{Array{T,1},1}
typealias Corpus RaggedMatrix{Int64}

type Model
  alphaPrior::Array{Float64,1}
  betaPrior::Float64
  topics::Array{Int64,2}        # K x V topic-word counts
  topicSums::Array{Int64,1}     # total token count in each topic
  documentSums::Array{Int64,2}  # K x D per-document topic counts
  assignments::RaggedMatrix{Int64}

  Model(alphaPrior::Array{Float64,1},
        betaPrior::Float64,
        V::Int64,
        corpus::Corpus) = begin
    K = length(alphaPrior)
    m = new(
      alphaPrior,
      betaPrior,
      zeros(Int64, K, V), # topics
      zeros(Int64, K), # topicSums
      zeros(Int64, K, length(corpus)), # documentSums
      fill(Array(Int64, 0), length(corpus)) # assignments
    )
    # Initialize by assigning each token a topic drawn from the prior
    # weights, updating the count matrices as we go.
    for dd in 1:length(corpus)
      m.assignments[dd] = fill(0, length(corpus[dd]))
      for ww in 1:length(corpus[dd])
        word = corpus[dd][ww]
        topic = sampleMultinomial(alphaPrior)
        m.assignments[dd][ww] = topic
        updateSufficientStatistics(word, topic, dd, 1, m)
      end
    end
    return m
  end
end

# Draw an index from the (unnormalized) discrete distribution p.
function sampleMultinomial(p::Array{Float64,1})
  pSum = sum(p)
  r = rand() * pSum
  K = length(p)
  for k in 1:K
    if r < p[k]
      return k
    else
      r -= p[k]
    end
  end
  return K # fall back to the last index on floating-point round-off
end

# Unnormalized conditional distribution over topics for one word in a
# document, given the current counts (the collapsed Gibbs conditional).
function wordDistribution(word::Int,
                          document::Int,
                          model::Model)
  V = size(model.topics, 2)
  (model.documentSums[1:end, document] + model.alphaPrior) .*
  (model.topics[1:end, word] + model.betaPrior) ./
  (model.topicSums + V * model.betaPrior)
end

function sampleWord(word::Int,
                    document::Int,
                    model::Model)
  p = wordDistribution(word, document, model)
  sampleMultinomial(p)
end

# Add (scale = 1) or remove (scale = -1) one token's contribution to the
# count matrices.
function updateSufficientStatistics(word::Int,
                                    topic::Int,
                                    document::Int,
                                    scale::Int,
                                    model::Model)
  model.topics[topic, word] += scale
  model.topicSums[topic] += scale
  model.documentSums[topic, document] += scale
end

function sampleDocument(words::Array{Int64,1},
                        document::Int,
                        model::Model)
  Nw = length(words)
  for ii in 1:Nw
    word = words[ii]
    oldTopic = model.assignments[document][ii]
    updateSufficientStatistics(word, oldTopic, document, -1, model)
    newTopic = sampleWord(word, document, model)
    model.assignments[document][ii] = newTopic
    updateSufficientStatistics(word, newTopic, document, 1, model)
  end
end

function sampleCorpus(corpus::Corpus,
                      model::Model)
  for ii in 1:length(corpus)
    sampleDocument(corpus[ii], ii, model)
  end
end

# Note: files are zero-indexed, but we are 1-indexed.
function termToWordSequence(term::String)
  parts = split(term, ":")
  fill(int64(parts[1]) + 1, int64(parts[2]))
end

# The functions below are designed for public consumption.
function trainModel(corpus::Corpus,
                    model::Model,
                    numIterations::Int64)
  for ii in 1:numIterations
    println(string("Iteration ", ii, "..."))
    sampleCorpus(corpus, model)
  end
end

function topTopicWords(model::Model,
                       lexicon::Array{ASCIIString,1},
                       numWords::Int64)
  [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]]
   for row in 1:size(model.topics, 1)]
end

function readDocuments(stream)
  lines = readlines(stream)
  convert(
    RaggedMatrix{Int64},
    [apply(vcat, [termToWordSequence(term) for term in split(line, " ")[2:end]])
     for line in lines])
end

function readLexicon(stream)
  lines = readlines(stream)
  map(chomp, convert(Array{String,1}, lines))
end

# Test stuff
testDocuments = readDocuments(open("cora.documents"))
testLexicon = readLexicon(open("cora.lexicon"))
model = Model(fill(0.1, 10), 0.01, length(testLexicon), testDocuments)
trainModel(testDocuments, model, 30)

topWords = topTopicWords(model, testLexicon, 21)

end

0 commit comments
