
Commit c645835

Jonathan Chang committed

First commit of the Topic Models Julia package

1 parent 94357fb commit c645835

File tree: README.md · REQUIRE · src/TopicModels.jl

3 files changed, +223 -0 lines changed

README.md

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# Topic Models for Julia

Topic models are Bayesian, hierarchical mixture models of discrete data.
This package implements utilities for reading and manipulating data commonly
associated with topic models, as well as inference and prediction procedures
for such models.

## Model description

The bulk of the package is designed for a particular topic model, Latent
Dirichlet Allocation (LDA; Blei et al., 2003). This model assumes a corpus
composed of a collection of bags of words; each bag of words is termed a
document. The space whence the words are drawn is termed the lexicon.
Formally, the model is defined as

    For each topic k,
        phi_k ~ Dirichlet(beta)
    For each document d,
        theta ~ Dirichlet(alpha)
        For each word w,
            z ~ Multinomial(theta)
            w ~ Multinomial(phi_z)
alpha and beta are hyperparameters of the model. The number of topics, K,
is a fixed parameter of the model, and w is observed. This package fits
the topics using collapsed Gibbs sampling (Griffiths and Steyvers, 2004).
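For reference, each Gibbs sweep resamples every token's topic assignment
z_i from its conditional distribution given all other assignments. This
identity is standard for collapsed LDA; it is not spelled out in the
original text, but it is the quantity computed by wordDistribution in
src/TopicModels.jl:

    p(z_i = k \mid z_{-i}, w) \propto (n_{d,k} + \alpha_k) \cdot \frac{n_{k,w_i} + \beta}{n_k + V\beta}

where the counts n_{d,k} (tokens of topic k in document d), n_{k,w_i}
(tokens of word w_i assigned to topic k), and n_k (all tokens assigned to
topic k) exclude token i, and V is the lexicon size.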
## Package usage

We describe the functions of the package using an example. First we load
corpora from data files as follows:

    testDocuments = readDocuments(open("cora.documents"))
    testLexicon = readLexicon(open("cora.lexicon"))
These read files in LDA-C format. The lexicon file is assumed to have one
word per line. The document file consists of one document per line. Each
document consists of a collection of tuples; the first element of each tuple
identifies the word while the second element gives the number of times that
word appears in the document. Words are indicated by an index into the
lexicon file, starting at zero. The tuples are separated by spaces, and the
entire line is prefixed by a number indicating the number of tuples for
that document.
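As an illustration (these numbers are hypothetical, not taken from the cora
files), a document line might read

    3 0:2 4:1 7:3

which describes a document with three distinct words: two occurrences of
lexicon word 0, one occurrence of word 4, and three occurrences of word 7.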
With the documents loaded, we instantiate a model that we want to train:

    model = Model(fill(0.1, 10), 0.01, length(testLexicon), testDocuments)
This is a model with 10 topics. alpha is set to a uniform Dirichlet prior
with weight 0.1 on each topic (the length of this vector determines the
number of topics). The second parameter sets the prior weight on phi
(i.e., beta) to 0.01. The third parameter is the lexicon size; here we use
the length of the lexicon we have just read. The fourth parameter is the
collection of documents.
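Other configurations follow the same pattern. For instance (hypothetical
settings, purely for illustration), a 25-topic model with a sparser
document-topic prior could be constructed as:

    model25 = Model(fill(0.05, 25), 0.01, length(testLexicon), testDocuments)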
With the model defined, we can train it on the corpus:

    trainModel(testDocuments, model, 30)

The trainModel function takes the corpus as the first argument, the model
as the second argument, and the number of iterations of collapsed Gibbs
sampling to perform as the third argument. The contents of the model
are mutated in place.
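Because the model is mutated in place, calling trainModel again resumes
sampling from the current state rather than starting over. For example:

    trainModel(testDocuments, model, 20)  # run 20 further iterations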
Finally, we can examine the output of the trained model using topTopicWords:

    topWords = topTopicWords(model, testLexicon, 10)

This function retrieves the top words associated with each topic, which
serves as a useful summary of the model. The first parameter is the model,
the second is the lexicon backing the corpus, and the third is the number
of words to retrieve for each topic. The output is an array of arrays of
words, sorted in decreasing order of prevalence within each topic.
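Beyond the top words, the posterior mean of each topic's word distribution
can be recovered from the model's sufficient statistics. This is a minimal
sketch, not part of the package API, using the same smoothing as
wordDistribution:

    K, V = size(model.topics)
    phiHat = zeros(Float64, K, V)
    for k in 1:K
        # Smoothed empirical distribution over the lexicon for topic k.
        phiHat[k, 1:end] = (model.topics[k, 1:end] .+ model.betaPrior) /
                           (model.topicSums[k] + V * model.betaPrior)
    end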
## See also

The R package from which much of this code was derived is available at
https://github.com/slycoder/R-lda-deprecated.

REQUIRE

Whitespace-only changes.

src/TopicModels.jl

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
module TopicModels

typealias RaggedMatrix{T} Array{Array{T,1},1}
typealias Corpus RaggedMatrix{Int64}

type Model
  alphaPrior::Array{Float64,1}
  betaPrior::Float64
  topics::Array{Int64,2}        # K x V topic-word counts
  topicSums::Array{Int64,1}     # total token count in each topic
  documentSums::Array{Int64,2}  # K x D per-document topic counts
  assignments::RaggedMatrix{Int64}

  Model(alphaPrior::Array{Float64,1},
        betaPrior::Float64,
        V::Int64,
        corpus::Corpus) = begin
    K = length(alphaPrior)
    m = new(
      alphaPrior,
      betaPrior,
      zeros(Int64, K, V), # topics
      zeros(Int64, K), # topicSums
      zeros(Int64, K, length(corpus)), # documentSums
      fill(Array(Int64, 0), length(corpus)) # assignments
    )
    # Initialize by assigning each token a topic drawn from the prior
    # weights, updating the count matrices as we go.
    for dd in 1:length(corpus)
      m.assignments[dd] = fill(0, length(corpus[dd]))
      for ww in 1:length(corpus[dd])
        word = corpus[dd][ww]
        topic = sampleMultinomial(alphaPrior)
        m.assignments[dd][ww] = topic
        updateSufficientStatistics(word, topic, dd, 1, m)
      end
    end
    return m
  end
end

# Draw an index from the (unnormalized) discrete distribution p.
function sampleMultinomial(p::Array{Float64,1})
  pSum = sum(p)
  r = rand() * pSum
  K = length(p)
  for k in 1:K
    if r < p[k]
      return k
    else
      r -= p[k]
    end
  end
  return K # fall back to the last index on floating-point round-off
end

# Unnormalized conditional distribution over topics for one word in a
# document, given the current counts (the collapsed Gibbs conditional).
function wordDistribution(word::Int,
                          document::Int,
                          model::Model)
  V = size(model.topics, 2)
  (model.documentSums[1:end, document] + model.alphaPrior) .*
  (model.topics[1:end, word] + model.betaPrior) ./
  (model.topicSums + V * model.betaPrior)
end

function sampleWord(word::Int,
                    document::Int,
                    model::Model)
  p = wordDistribution(word, document, model)
  sampleMultinomial(p)
end

# Add (scale = 1) or remove (scale = -1) one token's contribution to the
# count matrices.
function updateSufficientStatistics(word::Int,
                                    topic::Int,
                                    document::Int,
                                    scale::Int,
                                    model::Model)
  model.topics[topic, word] += scale
  model.topicSums[topic] += scale
  model.documentSums[topic, document] += scale
end

function sampleDocument(words::Array{Int64,1},
                        document::Int,
                        model::Model)
  Nw = length(words)
  for ii in 1:Nw
    word = words[ii]
    oldTopic = model.assignments[document][ii]
    updateSufficientStatistics(word, oldTopic, document, -1, model)
    newTopic = sampleWord(word, document, model)
    model.assignments[document][ii] = newTopic
    updateSufficientStatistics(word, newTopic, document, 1, model)
  end
end

function sampleCorpus(corpus::Corpus,
                      model::Model)
  for ii in 1:length(corpus)
    sampleDocument(corpus[ii], ii, model)
  end
end

# Note: files are zero-indexed, but we are 1-indexed.
function termToWordSequence(term::String)
  parts = split(term, ":")
  fill(int64(parts[1]) + 1, int64(parts[2]))
end

# The functions below are designed for public consumption.
function trainModel(corpus::Corpus,
                    model::Model,
                    numIterations::Int64)
  for ii in 1:numIterations
    println(string("Iteration ", ii, "..."))
    sampleCorpus(corpus, model)
  end
end

function topTopicWords(model::Model,
                       lexicon::Array{ASCIIString,1},
                       numWords::Int64)
  [lexicon[reverse(sortperm(model.topics'[1:end, row]))[1:numWords]]
   for row in 1:size(model.topics, 1)]
end

function readDocuments(stream)
  lines = readlines(stream)
  convert(
    RaggedMatrix{Int64},
    [apply(vcat, [termToWordSequence(term) for term in split(line, " ")[2:end]])
     for line in lines])
end

function readLexicon(stream)
  lines = readlines(stream)
  map(chomp, convert(Array{String,1}, lines))
end

# Test stuff
testDocuments = readDocuments(open("cora.documents"))
testLexicon = readLexicon(open("cora.lexicon"))
model = Model(fill(0.1, 10), 0.01, length(testLexicon), testDocuments)
trainModel(testDocuments, model, 30)

topWords = topTopicWords(model, testLexicon, 21)

end

0 commit comments
