Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 5bb9282

Browse files
author
Jonathan Chang
committed
Improve performance substantially. Include support for weights
1 parent 71dc35a commit 5bb9282

File tree

1 file changed

+103
-45
lines changed

1 file changed

+103
-45
lines changed

src/TopicModels.jl

Lines changed: 103 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -1,42 +1,94 @@
11
module TopicModels
22

3-
typealias RaggedMatrix{T} Array{Array{Int64,1},1}
4-
typealias Corpus RaggedMatrix{Int64}
3+
import Base.length
4+
5+
typealias RaggedMatrix{T} Array{Array{T,1},1}
6+
7+
type Corpus
8+
documents::RaggedMatrix{Int64}
9+
weights::RaggedMatrix{Float64}
10+
11+
Corpus(documents::RaggedMatrix{Int64},
12+
weights::RaggedMatrix{Float64}) = begin
13+
return new(
14+
documents,
15+
weights
16+
)
17+
end
18+
19+
Corpus(documents::RaggedMatrix{Int64}) = begin
20+
weights = map(documents) do doc
21+
ones(Float64, length(doc))
22+
end
23+
return new(
24+
documents,
25+
weights
26+
)
27+
end
28+
end
529

630
type Model
7-
alphaPrior::Array{Float64,1}
31+
alphaPrior::Vector{Float64}
832
betaPrior::Float64
9-
topics::Array{Int64,2}
10-
topicSums::Array{Int64,1}
11-
documentSums::Array{Int64,2}
33+
topics::Array{Float64,2}
34+
topicSums::Vector{Float64}
35+
documentSums::Array{Float64,2}
1236
assignments::RaggedMatrix{Int64}
37+
frozen::Bool
38+
corpus::Corpus
1339

14-
Model(alphaPrior::Array{Float64,1},
40+
Model(alphaPrior::Vector{Float64},
1541
betaPrior::Float64,
1642
V::Int64,
1743
corpus::Corpus) = begin
1844
K = length(alphaPrior)
1945
m = new(
2046
alphaPrior,
2147
betaPrior,
22-
zeros(Int64, K, V), # topics
23-
zeros(Int64, K), # topicSums
24-
zeros(Int64, K, length(corpus)), #documentSums
25-
fill(Array(Int64, 0), length(corpus)) # assignments
48+
zeros(Float64, K, V), # topics
49+
zeros(Float64, K), # topicSums
50+
zeros(Float64, K, length(corpus.documents)), #documentSums
51+
fill(Array(Int64, 0), length(corpus.documents)), # assignments
52+
false,
53+
corpus
2654
)
27-
for dd in 1:length(corpus)
28-
m.assignments[dd] = fill(0, length(corpus[dd]))
29-
for ww in 1:length(corpus[dd])
30-
word = corpus[dd][ww]
31-
topic = sampleMultinomial(alphaPrior)
32-
m.assignments[dd][ww] = topic
33-
updateSufficientStatistics(word, topic, dd, 1, m)
34-
end
35-
end
55+
initializeAssignments(m)
56+
return m
57+
end
58+
59+
Model(trainedModel::Model, corpus::Corpus) = begin
60+
m = new(
61+
trainedModel.alphaPrior,
62+
trainedModel.betaPrior,
63+
trainedModel.topics,
64+
trainedModel.topicSums,
65+
trainedModel.documentSums,
66+
fill(Array(Int64, 0), length(corpus.documents)),
67+
true,
68+
corpus
69+
)
70+
initializeAssignments(m)
3671
return m
3772
end
3873
end
3974

75+
function length(corpus::Corpus)
76+
return length(corpus.documents)
77+
end
78+
79+
function initializeAssignments(model::Model)
80+
for dd in 1:length(model.corpus)
81+
model.assignments[dd] = fill(0, length(model.corpus.documents[dd]))
82+
for ww in 1:length(model.corpus.documents[dd])
83+
word = model.corpus.documents[dd][ww]
84+
topic = sampleMultinomial(model.alphaPrior)
85+
model.assignments[dd][ww] = topic
86+
updateSufficientStatistics(
87+
word, topic, dd, model.corpus.weights[dd][ww], model)
88+
end
89+
end
90+
end
91+
4092
function sampleMultinomial(p::Array{Float64,1})
4193
pSum = sum(p)
4294
r = rand() * pSum
@@ -53,49 +105,56 @@ end
53105

54106
function wordDistribution(word::Int,
55107
document::Int,
56-
model::Model)
108+
model::Model,
109+
out::Vector{Float64})
57110
V = size(model.topics, 2)
58-
(model.documentSums[1:end,document] + model.alphaPrior) .*
59-
(model.topics[1:end, word] + model.betaPrior) ./
60-
(model.topicSums + V * model.betaPrior)
111+
for ii in 1:length(out)
112+
out[ii] = (model.documentSums[ii, document] + model.alphaPrior[ii]) *
113+
(model.topics[ii, word] + model.betaPrior) /
114+
(model.topicSums[ii] + V * model.betaPrior)
115+
end
116+
return out
61117
end
62118

63119
function sampleWord(word::Int,
64120
document::Int,
65-
model::Model)
66-
p = wordDistribution(word, document, model)
121+
model::Model,
122+
p::Vector{Float64})
123+
wordDistribution(word, document, model, p)
67124
sampleMultinomial(p)
68125
end
69126

70127

71-
function updateSufficientStatistics(word::Int,
72-
topic::Int,
73-
document::Int,
74-
scale::Int,
128+
function updateSufficientStatistics(word::Int64,
129+
topic::Int64,
130+
document::Int64,
131+
scale::Float64,
75132
model::Model)
76-
model.topics[topic, word] += scale
77-
model.topicSums[topic] += scale
78133
model.documentSums[topic, document] += scale
134+
model.topicSums[topic] += scale * !model.frozen
135+
model.topics[topic, word] += scale * !model.frozen
79136
end
80137

81-
function sampleDocument(words::Array{Int64,1},
82-
document::Int,
83-
model::Model)
138+
function sampleDocument(document::Int,
139+
model::Model)
140+
words = model.corpus.documents[document]
84141
Nw = length(words)
142+
weights = model.corpus.weights[document]
143+
K = length(model.alphaPrior)
144+
p = Array(Float64, K)
85145
for ii in 1:Nw
86146
word = words[ii]
87147
oldTopic = model.assignments[document][ii]
88-
updateSufficientStatistics(word, oldTopic, document, -1, model)
89-
newTopic = sampleWord(word, document, model)
148+
updateSufficientStatistics(word, oldTopic, document, -weights[ii], model)
149+
newTopic::Int64 = sampleWord(word, document, model, p)
90150
model.assignments[document][ii] = newTopic
91-
updateSufficientStatistics(word, newTopic, document, 1, model)
151+
updateSufficientStatistics(word, newTopic, document, weights[ii], model)
92152
end
93153
end
94154

95-
function sampleCorpus(corpus::Corpus,
96-
model::Model)
97-
for ii in 1:length(corpus)
98-
sampleDocument(corpus[ii], ii, model)
155+
function sampleCorpus(model::Model)
156+
for ii in 1:length(model.corpus)
157+
sampleDocument(ii, model)
99158
end
100159
end
101160

@@ -106,12 +165,11 @@ function termToWordSequence(term::String)
106165
end
107166

108167
# The functions below are designed for public consumption
109-
function trainModel(corpus::Corpus,
110-
model::Model,
168+
function trainModel(model::Model,
111169
numIterations::Int64)
112170
for ii in 1:numIterations
113171
println(string("Iteration ", ii, "..."))
114-
sampleCorpus(corpus, model)
172+
sampleCorpus(model)
115173
end
116174
end
117175

0 commit comments

Comments
 (0)