Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 3f064a7

Browse files
author
Jonathan Chang
committed
Add a fast pairwise computation scheme. Also add a test/demo.
1 parent 6280b73 commit 3f064a7

File tree

5 files changed

+97
-13
lines changed

5 files changed

+97
-13
lines changed

pkg/NAMESPACE

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import(methods)
22
importClassesFrom(Rcpp, "C++Object", "C++Class", "Module")
33
useDynLib(Rflim)
4-
export(Rflim)
4+
export(Flim)
5+
export(count.pairs)

pkg/R/Flim.R

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
Flim <- function(singleton.counts,
2+
pairwise.counts,
3+
document.count,
4+
beta.1 = 0.1,
5+
beta.2 = 0.1,
6+
num.iterations = 15) {
7+
N <- length(singleton.counts)
8+
9+
if (max(pairwise.counts[,1]) > N ||
10+
min(pairwise.counts[,1]) < 0 ||
11+
max(pairwise.counts[,2]) > N ||
12+
min(pairwise.counts[,2]) < 0) {
13+
stop("Atrocity! Pairwise count indices must be between 1 and N.");
14+
}
15+
16+
flim.obj <- new(.module$Flim, N, beta.1, beta.2)
17+
18+
19+
flim.obj$loadCorpus(singleton.counts,
20+
pairwise.counts[,1],
21+
pairwise.counts[,2],
22+
pairwise.counts[,3],
23+
document.count)
24+
for (ii in 1:num.iterations) {
25+
flim.obj$optimizeAll()
26+
}
27+
return(flim.obj)
28+
}

pkg/R/count.pairs.R

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# This function takes an lda document structure and returns a sparse matrix
2+
# giving counts of how many documents in which each pair of words co-occur.
3+
count.pairs <- function(documents) {
4+
## documents is uniquified per document already, so we can ignore the counts.
5+
w <- lapply(documents, function(x) x[1,])
6+
## infer the size of the matrix from the documents
7+
V <- max(unlist(w)) + 1L
8+
M <- Matrix(0, V, V)
9+
10+
## create a list giving the outer product indices for each document.
11+
w.pairs <- lapply(w, function(ww) {
12+
cbind(rep(ww + 1L, length(ww)),
13+
rep(ww + 1L, each=length(ww)))
14+
})
15+
16+
## cross tabulate to get counts.
17+
M <- xtabs(~ X1 + X2,
18+
data.frame(do.call(rbind, w.pairs)),
19+
sparse=T)
20+
}

pkg/src/flim.cpp

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ class Flim {
3737
singleton_expectation_(N),
3838
beta1_(beta1),
3939
beta2_(beta2),
40-
empirical_pair_(N),
40+
empirical_pair_(N, N),
4141
empirical_singleton_(N) {
42-
gsl_matrix_set_zero(lambda_);
43-
gsl_vector_set_zero(ones_);
42+
gsl_matrix_float_set_zero(lambda_);
43+
gsl_vector_float_set_zero(ones_);
4444
}
4545

4646
~Flim() {
@@ -61,13 +61,13 @@ class Flim {
6161
// (note that lambda_{i, i} = 0)
6262
unsigned int estimateExpectations() {
6363
// estimates_{x,y} = lambda_{x,y}
64-
gsl_matrix_memcpy(estimates_, lambda_);
64+
gsl_matrix_float_memcpy(estimates_, lambda_);
6565
// estimates_{x,y} = lambda_{x,y} + kappa_x
6666
gsl_blas_sger(1.0, ones_, kappa_, estimates_);
6767
// estimates_{x,y} = lambda_{x,y} + kappa_x + kappa_y
6868
gsl_blas_sger(1.0, kappa_, ones_, estimates_);
6969

70-
gsl_vector_set_zero(q_lambda_);
70+
gsl_vector_float_set_zero(q_lambda_);
7171
gsl_blas_sgemv(CblasNoTrans,
7272
1.0,
7373
lambda_,
@@ -82,10 +82,14 @@ class Flim {
8282
}
8383

8484
void initializeKappa(int num_documents) {
85-
gsl_vector_memcpy(kappa_, empirical_singleton_);
86-
gsl_vector_scale(kappa_, num_documents);
87-
gsl_vector_add_constant(kappa_, 1.0);
88-
gsl_vector_scale(kappa_, 1.0 / (2.0 + num_documents));
85+
gsl_vector_float_memcpy(kappa_, empirical_singleton_);
86+
gsl_vector_float_scale(kappa_, num_documents);
87+
gsl_vector_float_add_constant(kappa_, 1.0);
88+
gsl_vector_float_scale(kappa_, 1.0 / (2.0 + num_documents));
89+
for (int ii = 0; ii < kappa_.size(); ++ii) {
90+
singleton_expectation_[ii] = kappa_[ii];
91+
kappa_[ii] = logit(kappa_[ii]);
92+
}
8993
}
9094

9195
void loadCorpus(const std::vector<double>& singleton,
@@ -97,8 +101,8 @@ class Flim {
97101
empirical_singleton_[ii] = singleton[ii] / num_documents;
98102
}
99103
for (int ii = 0; ii < pair_x.size(); ++ii) {
100-
int xx = pair_x[ii];
101-
int yy = pair_y[ii];
104+
int xx = pair_x[ii] - 1;
105+
int yy = pair_y[ii] - 1;
102106
empirical_pair_(xx, yy) = pair_count[ii] / num_documents;
103107
}
104108
initializeKappa(num_documents);
@@ -170,11 +174,19 @@ class Flim {
170174
lambda_(x,y) = new_lambda;
171175
lambda_(y,x) = new_lambda;
172176
}
177+
178+
RcppGSL::matrix<float> getLambda() {
179+
return lambda_;
180+
}
173181
};
174182

175183
RCPP_MODULE(Rflim) {
176184
using namespace Rcpp;
177185

178186
class_<Flim>("Flim")
179-
.method("estimateExpectations", &Flim::estimateExpectations);
187+
.constructor<int,double,double>()
188+
.method("loadCorpus", &Flim::loadCorpus)
189+
.method("optimizeAll", &Flim::optimizeAll)
190+
.method("estimateExpectations", &Flim::estimateExpectations)
191+
.method("getLambda", &Flim::getLambda);
180192
}

tests/cora.test.R

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
require(lda)
2+
require(Rflim)
3+
require(Matrix)
4+
5+
data(cora.documents)
6+
data(cora.vocab)
7+
8+
counts <- count.pairs(cora.documents)
9+
singleton.counts <- diag(counts)
10+
counts <- as(counts, 'dgTMatrix')
11+
pairwise.counts <- subset(data.frame(
12+
i = counts@i + 1L,
13+
j = counts@j + 1L,
14+
x = counts@x), x > 0 & i < j)
15+
16+
save(pairwise.counts, singleton.counts, file="counts.Rdata")
17+
18+
flim.instance <- Flim(singleton.counts,
19+
pairwise.counts,
20+
length(cora.documents))
21+
22+
lambda <- flim.instance$getLambda()
23+
save(lambda, file="lambda.Rdata")

0 commit comments

Comments
 (0)