1
1
module TopicModels
2
2
3
- typealias RaggedMatrix{T} Array{Array{Int64,1 },1 }
4
- typealias Corpus RaggedMatrix{Int64}
3
+ import Base. length
4
+
5
# A "ragged" matrix: a vector of rows, where each row is itself a vector of `T`
# and rows are free to differ in length.
typealias RaggedMatrix{T} Vector{Vector{T}}
6
+
7
# A collection of documents together with a per-word weight.
#
# Fields:
#   documents - documents[d][w] is the integer word id at position w of document d
#   weights   - weights[d][w] is the weight applied to that word occurrence when
#               the sufficient statistics are updated
#
# Both fields are ragged and must have exactly the same shape.
type Corpus
    documents::RaggedMatrix{Int64}
    weights::RaggedMatrix{Float64}

    # Build a corpus from explicit weights.
    # Throws ArgumentError when `weights` does not mirror the shape of `documents`;
    # without this check a mismatch only shows up later as a BoundsError (or, when
    # `weights` is longer, is silently ignored) while sampling.
    function Corpus(documents::RaggedMatrix{Int64},
                    weights::RaggedMatrix{Float64})
        if length(documents) != length(weights)
            throw(ArgumentError(string(
                "documents and weights must have the same number of documents (got ",
                length(documents), " and ", length(weights), ")")))
        end
        for dd in 1:length(documents)
            if length(documents[dd]) != length(weights[dd])
                throw(ArgumentError(string(
                    "document ", dd, " has ", length(documents[dd]),
                    " words but ", length(weights[dd]), " weights")))
            end
        end
        return new(documents, weights)
    end

    # Build a corpus in which every word occurrence has weight 1.0.
    function Corpus(documents::RaggedMatrix{Int64})
        # The typed comprehension pins the element type, so an empty corpus still
        # yields a RaggedMatrix{Float64} instead of whatever `map` infers.
        weights = Vector{Float64}[ones(Float64, length(doc)) for doc in documents]
        return new(documents, weights)
    end
end
5
29
6
30
# Sampler state for a topic model over `corpus`.
#
# Fields (K = length(alphaPrior), D = number of documents in `corpus`):
#   alphaPrior   - per-topic prior; its length fixes the number of topics K
#   betaPrior    - scalar prior added to every (topic, word) weight
#   topics       - K x V matrix of accumulated weight per (topic, word id)
#   topicSums    - length-K vector of accumulated weight per topic
#   documentSums - K x D matrix of accumulated weight per (topic, document)
#   assignments  - assignments[d][w] is the topic currently assigned to word w of
#                  document d
#   frozen       - when true, updateSufficientStatistics adds zero to `topics` and
#                  `topicSums`, so only `documentSums` changes
#   corpus       - the corpus this model samples over
type Model
    alphaPrior::Vector{Float64}
    betaPrior::Float64
    topics::Array{Float64,2}
    topicSums::Vector{Float64}
    documentSums::Array{Float64,2}
    assignments::RaggedMatrix{Int64}
    frozen::Bool
    corpus::Corpus

    # Create an untrained model with K = length(alphaPrior) topics and V columns in
    # `topics`, then draw an initial topic assignment for every word of `corpus`.
    function Model(alphaPrior::Vector{Float64},
                   betaPrior::Float64,
                   V::Int64,
                   corpus::Corpus)
        K = length(alphaPrior)
        D = length(corpus.documents)
        m = new(
            alphaPrior,
            betaPrior,
            zeros(Float64, K, V),                    # topics
            zeros(Float64, K),                       # topicSums
            zeros(Float64, K, D),                    # documentSums
            Vector{Int64}[Int64[] for dd in 1:D],    # assignments (one array per doc)
            false,
            corpus
        )
        initializeAssignments(m)
        return m
    end

    # Create a frozen model that reuses the priors and topic/word statistics of
    # `trainedModel` to sample assignments for a different `corpus`.
    function Model(trainedModel::Model, corpus::Corpus)
        K = length(trainedModel.alphaPrior)
        D = length(corpus.documents)
        m = new(
            trainedModel.alphaPrior,
            trainedModel.betaPrior,
            trainedModel.topics,       # shared; never modified while frozen
            trainedModel.topicSums,    # shared; never modified while frozen
            # documentSums is indexed by document number within *this* corpus, so it
            # needs its own K x D storage. Reusing trainedModel.documentSums would
            # add this corpus's weights into the trained model's per-document sums
            # and index out of bounds when this corpus has more documents.
            zeros(Float64, K, D),
            Vector{Int64}[Int64[] for dd in 1:D],
            true,
            corpus
        )
        initializeAssignments(m)
        return m
    end
end
39
74
75
# Number of documents in the corpus.
length(corpus::Corpus) = length(corpus.documents)
78
+
79
# Give every word of every document in `model.corpus` an initial topic, drawn with
# sampleMultinomial(model.alphaPrior), and record each draw in the model's
# sufficient statistics using that word's weight.
function initializeAssignments(model::Model)
    documents = model.corpus.documents
    weights = model.corpus.weights
    for dd in 1:length(documents)
        words = documents[dd]
        documentTopics = zeros(Int64, length(words))
        model.assignments[dd] = documentTopics
        for ww in 1:length(words)
            topic = sampleMultinomial(model.alphaPrior)
            documentTopics[ww] = topic
            updateSufficientStatistics(words[ww], topic, dd, weights[dd][ww], model)
        end
    end
    return nothing
end
91
+
40
92
function sampleMultinomial (p:: Array{Float64,1} )
41
93
pSum = sum (p)
42
94
r = rand () * pSum
53
105
54
106
# Write into `out` one unnormalized weight per index of `out` for `word` in
# `document`, and return `out`. Entry ii is
#   (documentSums[ii, document] + alphaPrior[ii]) *
#   (topics[ii, word] + betaPrior) / (topicSums[ii] + V * betaPrior)
# where V is the number of columns of `model.topics`.
function wordDistribution(word::Int,
                          document::Int,
                          model::Model,
                          out::Vector{Float64})
    vocabularySize = size(model.topics, 2)
    smoothing = vocabularySize * model.betaPrior
    for topic in 1:length(out)
        documentTerm = model.documentSums[topic, document] + model.alphaPrior[topic]
        wordTerm = model.topics[topic, word] + model.betaPrior
        normalizer = model.topicSums[topic] + smoothing
        out[topic] = documentTerm * wordTerm / normalizer
    end
    return out
end
62
118
63
119
# Draw a topic for `word` in `document`: fill the scratch buffer `p` via
# wordDistribution, then pass it to sampleMultinomial and return the result.
function sampleWord(word::Int,
                    document::Int,
                    model::Model,
                    p::Vector{Float64})
    distribution = wordDistribution(word, document, model, p)
    return sampleMultinomial(distribution)
end
69
126
70
127
71
# Add `scale` to the statistics for assigning `word` in `document` to `topic`.
# The per-document sum always receives `scale`; the topic total and the
# topic/word entry receive `scale` multiplied by `!model.frozen`.
function updateSufficientStatistics(word::Int64,
                                    topic::Int64,
                                    document::Int64,
                                    scale::Float64,
                                    model::Model)
    # Multiplying by the Bool turns the shared increment into zero when frozen.
    sharedIncrement = scale * (!model.frozen)
    model.documentSums[topic, document] += scale
    model.topicSums[topic] += sharedIncrement
    model.topics[topic, word] += sharedIncrement
end
80
137
81
# Resample the topic of every word in document number `document`. For each word
# its current assignment is subtracted from the sufficient statistics (using the
# word's weight), a new topic is drawn with sampleWord, and the new assignment is
# added back.
function sampleDocument(document::Int,
                        model::Model)
    documentWords = model.corpus.documents[document]
    documentWeights = model.corpus.weights[document]
    documentTopics = model.assignments[document]
    # Scratch buffer with one slot per topic; wordDistribution overwrites it fully.
    scratch = similar(model.alphaPrior)
    for position in 1:length(documentWords)
        word = documentWords[position]
        weight = documentWeights[position]
        previousTopic = documentTopics[position]
        updateSufficientStatistics(word, previousTopic, document, -weight, model)
        newTopic::Int64 = sampleWord(word, document, model, scratch)
        documentTopics[position] = newTopic
        updateSufficientStatistics(word, newTopic, document, weight, model)
    end
end
94
154
95
# Run one sampling pass: call sampleDocument for each document of `model.corpus`,
# in order.
function sampleCorpus(model::Model)
    documentCount = length(model.corpus.documents)
    for document in 1:documentCount
        sampleDocument(document, model)
    end
    return nothing
end
101
160
@@ -106,12 +165,11 @@ function termToWordSequence(term::String)
106
165
end
107
166
108
167
# The functions below are designed for public consumption
109
# Run `numIterations` sampling passes over the model's corpus, printing a progress
# line before each pass.
function trainModel(model::Model,
                    numIterations::Int64)
    for iteration in 1:numIterations
        println(" Iteration ", iteration, " ...")
        sampleCorpus(model)
    end
    return nothing
end
117
175
0 commit comments