Add a fast pairwise computation scheme. Also add a test/demo.

Jonathan Chang · Jonathan Chang · commit 3f064a7782de · 2011-09-02T22:07:45.000-04:00
diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE
@@ -1,4 +1,5 @@
 import(methods)
 importClassesFrom(Rcpp, "C++Object", "C++Class", "Module")
 useDynLib(Rflim)
-export(Rflim)
+export(Flim)
+export(count.pairs)
diff --git a/pkg/R/Flim.R b/pkg/R/Flim.R
@@ -0,0 +1,41 @@
+# Fit a Flim model to singleton and pairwise co-occurrence counts.
+#
+# Args:
+#   singleton.counts: per-item counts; its length N sets the model size.
+#   pairwise.counts: matrix or data frame whose first two columns hold
+#     1-based item indices and whose third column holds the pair count.
+#   document.count: number of documents, forwarded to loadCorpus().
+#   beta.1, beta.2: forwarded unchanged to the Flim constructor.
+#   num.iterations: number of optimizeAll() sweeps to run.
+#
+# Returns:
+#   The Flim module object after the sweeps have run.
+Flim <- function(singleton.counts,
+                 pairwise.counts,
+                 document.count,
+                 beta.1 = 0.1,
+                 beta.2 = 0.1,
+                 num.iterations = 15) {
+  N <- length(singleton.counts)
+
+  ## Indices are 1-based here and the C++ side subtracts one, so 0 must be
+  ## rejected just like anything above N.
+  pair.indices <- c(pairwise.counts[, 1], pairwise.counts[, 2])
+  if (any(is.na(pair.indices)) ||
+      any(pair.indices < 1 | pair.indices > N)) {
+    stop("Atrocity!  Pairwise count indices must be between 1 and N.")
+  }
+
+  flim.obj <- new(.module$Flim, N, beta.1, beta.2)
+
+  flim.obj$loadCorpus(singleton.counts,
+                      pairwise.counts[, 1],
+                      pairwise.counts[, 2],
+                      pairwise.counts[, 3],
+                      document.count)
+  ## seq_len() runs zero sweeps for num.iterations = 0; 1:0 would run two.
+  for (ii in seq_len(num.iterations)) {
+    flim.obj$optimizeAll()
+  }
+  flim.obj
+}
diff --git a/pkg/R/count.pairs.R b/pkg/R/count.pairs.R
@@ -0,0 +1,32 @@
+# Count, for every pair of words, how many documents contain both.
+#
+# Args:
+#   documents: an lda-style document list; only row 1 of each element is
+#     read, and it is treated as 0-based word indices.
+#
+# Returns:
+#   A sparse V x V matrix, V = largest word index + 1, whose (i, j) entry is
+#   the number of documents containing both word i and word j (1-based); the
+#   diagonal therefore holds the per-word document counts.
+count.pairs <- function(documents) {
+  ## documents is uniquified per document already, so we can ignore the counts.
+  w <- lapply(documents, function(x) x[1, ])
+  ## infer the size of the matrix from the documents
+  V <- max(unlist(w)) + 1L
+
+  ## create a list giving the outer product indices for each document.
+  w.pairs <- lapply(w, function(ww) {
+    cbind(rep(ww + 1L, length(ww)),
+          rep(ww + 1L, each = length(ww)))
+  })
+  pairs <- do.call(rbind, w.pairs)
+
+  ## cross tabulate to get counts.  The factor levels are pinned to 1..V so
+  ## that row/column positions equal word indices even when some index never
+  ## occurs; letting xtabs() infer the levels would silently drop such rows.
+  word.ids <- seq_len(V)
+  xtabs(~ X1 + X2,
+        data.frame(X1 = factor(pairs[, 1], levels = word.ids),
+                   X2 = factor(pairs[, 2], levels = word.ids)),
+        sparse = TRUE)
+}
diff --git a/pkg/src/flim.cpp b/pkg/src/flim.cpp
@@ -37,10 +37,10 @@ class Flim {
                        singleton_expectation_(N),
                        beta1_(beta1),
                        beta2_(beta2),
-                       empirical_pair_(N),
+                       empirical_pair_(N, N),
                        empirical_singleton_(N) {
-    gsl_matrix_set_zero(lambda_);
-    gsl_vector_set_zero(ones_);
+    gsl_matrix_float_set_zero(lambda_);
+    gsl_vector_float_set_zero(ones_);
   }
 
   ~Flim() {
@@ -61,13 +61,13 @@ class Flim {
   // (note that lambda_{i, i} = 0)
   unsigned int estimateExpectations() {
     // estimates_{x,y} = lambda_{x,y}
-    gsl_matrix_memcpy(estimates_, lambda_);
+    gsl_matrix_float_memcpy(estimates_, lambda_);
     // estimates_{x,y} = lambda_{x,y} + kappa_x
     gsl_blas_sger(1.0, ones_, kappa_, estimates_);
     // estimates_{x,y} = lambda_{x,y} + kappa_x + kappa_y
     gsl_blas_sger(1.0, kappa_, ones_, estimates_);
 
-    gsl_vector_set_zero(q_lambda_);
+    gsl_vector_float_set_zero(q_lambda_);
     gsl_blas_sgemv(CblasNoTrans,
                    1.0,
                    lambda_,
@@ -82,10 +82,14 @@ class Flim {
   }
 
   void initializeKappa(int num_documents) {
-    gsl_vector_memcpy(kappa_, empirical_singleton_);
-    gsl_vector_scale(kappa_, num_documents);
-    gsl_vector_add_constant(kappa_, 1.0);
-    gsl_vector_scale(kappa_, 1.0 / (2.0 + num_documents));
+    gsl_vector_float_memcpy(kappa_, empirical_singleton_);  // start from singleton / num_documents
+    gsl_vector_float_scale(kappa_, num_documents);  // back to raw counts
+    gsl_vector_float_add_constant(kappa_, 1.0);  // count + 1
+    gsl_vector_float_scale(kappa_, 1.0 / (2.0 + num_documents));  // (count + 1) / (num_documents + 2)
+    for (int ii = 0; ii < kappa_.size(); ++ii) {
+      singleton_expectation_[ii] = kappa_[ii];  // seed the expectation with the smoothed frequency
+      kappa_[ii] = logit(kappa_[ii]);  // then keep kappa_ as logit() of that frequency
+    }
   }
 
   void loadCorpus(const std::vector<double>& singleton, 
@@ -97,8 +101,8 @@ class Flim {
       empirical_singleton_[ii] = singleton[ii] / num_documents;
     }
     for (int ii = 0; ii < pair_x.size(); ++ii) {
-      int xx = pair_x[ii];
-      int yy = pair_y[ii];
+      int xx = pair_x[ii] - 1;
+      int yy = pair_y[ii] - 1;
       empirical_pair_(xx, yy) = pair_count[ii] / num_documents;
     }
     initializeKappa(num_documents);
@@ -170,11 +174,19 @@ class Flim {
     lambda_(x,y) = new_lambda;
     lambda_(y,x) = new_lambda;
   }
+
+  RcppGSL::matrix<float> getLambda() {  // accessor registered with the Rcpp module as "getLambda"
+    return lambda_;  // hands back the lambda_ member as-is
+  }
 };
 
 RCPP_MODULE(Rflim) {
 	using namespace Rcpp;
 
 	class_<Flim>("Flim")		
-		.method("estimateExpectations", &Flim::estimateExpectations);
+    .constructor<int,double,double>()  // invoked from R as new(.module$Flim, N, beta.1, beta.2)
+		.method("loadCorpus", &Flim::loadCorpus)  // called once by the R wrapper before optimizing
+		.method("optimizeAll", &Flim::optimizeAll)  // called num.iterations times by the R wrapper
+		.method("estimateExpectations", &Flim::estimateExpectations)
+    .method("getLambda", &Flim::getLambda);  // used by the cora test to extract lambda
 }
diff --git a/tests/cora.test.R b/tests/cora.test.R
@@ -0,0 +1,29 @@
+## Demo/test: fit Flim to word co-occurrence counts from the cora corpus.
+## library() is used rather than require() so a missing package stops the
+## script immediately instead of failing later with an unrelated error.
+library(lda)
+library(Rflim)
+library(Matrix)
+
+data(cora.documents)
+data(cora.vocab)
+
+## The diagonal holds the per-word document counts.
+counts <- count.pairs(cora.documents)
+singleton.counts <- diag(counts)
+## Triplet form exposes 0-based (i, j, x) slots; keep each unordered pair once
+## (i < j) and shift to the 1-based indices Flim() expects.
+counts <- as(counts, 'dgTMatrix')
+pairwise.counts <- subset(data.frame(
+  i = counts@i + 1L,
+  j = counts@j + 1L,
+  x = counts@x), x > 0 & i < j)
+
+save(pairwise.counts, singleton.counts, file="counts.Rdata")
+
+flim.instance <- Flim(singleton.counts,
+                      pairwise.counts,
+                      length(cora.documents))
+
+lambda <- flim.instance$getLambda()
+save(lambda, file="lambda.Rdata")