[MRG+2] switch to multinomial composition for mixture sampling (scikit-learn#7702)

lesteve · web-flow · commit ad6f0941d385 · 2016-10-20T22:27:19.000+02:00
* switch to multinomial composition for mixture sampling

* add shape assertions to test

* Use n_components=3 to test actual regression

Previously n_components and n_features were equal in the test data, which
masked places in the test where one was mistakenly used instead of the other.
diff --git a/sklearn/mixture/base.py b/sklearn/mixture/base.py
@@ -385,7 +385,7 @@ def sample(self, n_samples=1):
 
         _, n_features = self.means_.shape
         rng = check_random_state(self.random_state)
-        n_samples_comp = np.round(self.weights_ * n_samples).astype(int)
+        n_samples_comp = rng.multinomial(n_samples, self.weights_)
 
         if self.covariance_type == 'full':
             X = np.vstack([
diff --git a/sklearn/mixture/tests/test_gaussian_mixture.py b/sklearn/mixture/tests/test_gaussian_mixture.py
@@ -918,7 +918,7 @@ def test_property():
 
 def test_sample():
     rng = np.random.RandomState(0)
-    rand_data = RandomData(rng, scale=7)
+    rand_data = RandomData(rng, scale=7, n_components=3)
     n_features, n_components = rand_data.n_features, rand_data.n_components
 
     for covar_type in COVARIANCE_TYPE:
@@ -935,8 +935,10 @@ def test_sample():
                              gmm.sample, 0)
 
         # Just to make sure the class samples correctly
-        X_s, y_s = gmm.sample(20000)
-        for k in range(n_features):
+        n_samples = 20000
+        X_s, y_s = gmm.sample(n_samples)
+
+        for k in range(n_components):
             if covar_type == 'full':
                 assert_array_almost_equal(gmm.covariances_[k],
                                           np.cov(X_s[y_s == k].T), decimal=1)
@@ -953,9 +955,17 @@ def test_sample():
                     decimal=1)
 
         means_s = np.array([np.mean(X_s[y_s == k], 0)
-                           for k in range(n_features)])
+                           for k in range(n_components)])
         assert_array_almost_equal(gmm.means_, means_s, decimal=1)
 
+        # Check shapes of sampled data, see
+        # https://github.com/scikit-learn/scikit-learn/issues/7701
+        assert_equal(X_s.shape, (n_samples, n_features))
+
+        for sample_size in range(1, 100):
+            X_s, _ = gmm.sample(sample_size)
+            assert_equal(X_s.shape, (sample_size, n_features))
+
 
 @ignore_warnings(category=ConvergenceWarning)
 def test_init():