From 571f4afc5321d831ddadb158259762ca1e23bf85 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 21 Jun 2016 09:24:14 +0800
Subject: [PATCH 01/50] Make helper functions in cd use fused types

---
 sklearn/linear_model/cd_fast.pyx | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 847ef1e98cb4e..7beeb8d4f4b2b 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -13,6 +13,7 @@ import numpy.linalg as linalg
 
 cimport cython
 from cpython cimport bool
+from cython cimport floating
 import warnings
 
 ctypedef np.float64_t DOUBLE
@@ -42,13 +43,13 @@ cdef inline UINT32_t rand_int(UINT32_t end, UINT32_t* random_state) nogil:
     return our_rand_r(random_state) % end
 
 
-cdef inline double fmax(double x, double y) nogil:
+cdef inline floating fmax(floating x, floating y) nogil:
     if x > y:
         return x
     return y
 
 
-cdef inline double fsign(double f) nogil:
+cdef inline floating fsign(floating f) nogil:
     if f == 0:
         return 0
     elif f > 0:
@@ -57,11 +58,11 @@ cdef inline double fsign(double f) nogil:
         return -1.0
 
 
-cdef double abs_max(int n, double* a) nogil:
+cdef floating abs_max(int n, floating* a) nogil:
     """np.max(np.abs(a))"""
     cdef int i
-    cdef double m = fabs(a[0])
-    cdef double d
+    cdef floating m = fabs(a[0])
+    cdef floating d
     for i in range(1, n):
         d = fabs(a[i])
         if d > m:
@@ -69,11 +70,11 @@ cdef double abs_max(int n, double* a) nogil:
     return m
 
 
-cdef double max(int n, double* a) nogil:
+cdef floating max(int n, floating* a) nogil:
     """np.max(a)"""
     cdef int i
-    cdef double m = a[0]
-    cdef double d
+    cdef floating m = a[0]
+    cdef floating d
     for i in range(1, n):
         d = a[i]
         if d > m:
@@ -81,11 +82,11 @@ cdef double max(int n, double* a) nogil:
     return m
 
 
-cdef double diff_abs_max(int n, double* a, double* b) nogil:
+cdef floating diff_abs_max(int n, floating* a, floating* b) nogil:
     """np.max(np.abs(a - b))"""
     cdef int i
-    cdef double m = fabs(a[0] - b[0])
-    cdef double d
+    cdef floating m = fabs(a[0] - b[0])
+    cdef floating d
     for i in range(1, n):
         d = fabs(a[i] - b[i])
         if d > m:
@@ -331,6 +332,7 @@ def sparse_enet_coordinate_descent(double[:] w,
     cdef double normalize_sum
     cdef double gap = tol + 1.0
     cdef double d_w_tol = tol
+    cdef double dual_norm_XtA
     cdef unsigned int jj
     cdef unsigned int n_iter = 0
     cdef unsigned int f_iter

From a2756a31020eee05b76d626bf928b91e85b07364 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Wed, 22 Jun 2016 01:31:00 +0800
Subject: [PATCH 02/50] Import cblas float functions

---
 sklearn/linear_model/cd_fast.pyx | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 7beeb8d4f4b2b..74bd89598b537 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -106,19 +106,34 @@ cdef extern from "cblas.h":
 
     void daxpy "cblas_daxpy"(int N, double alpha, double *X, int incX,
                              double *Y, int incY) nogil
+    void saxpy "cblas_saxpy"(int N, float alpha, float *X, int incX,
+                             float *Y, int incY) nogil
     double ddot "cblas_ddot"(int N, double *X, int incX, double *Y, int incY
                              ) nogil
+    float sdot "cblas_sdot"(int N, float *X, int incX, float *Y, int incY
+                             ) nogil
     double dasum "cblas_dasum"(int N, double *X, int incX) nogil
+    float sasum "cblas_sasum"(int N, float *X, int incX) nogil
     void dger "cblas_dger"(CBLAS_ORDER Order, int M, int N, double alpha,
                 double *X, int incX, double *Y, int incY, double *A, int lda) nogil
+    void sger "cblas_sger"(CBLAS_ORDER Order, int M, int N, float alpha,
+                float *X, int incX, float *Y, int incY, float *A, int lda) nogil
     void dgemv "cblas_dgemv"(CBLAS_ORDER Order,
                       CBLAS_TRANSPOSE TransA, int M, int N,
                       double alpha, double *A, int lda,
                       double *X, int incX, double beta,
                       double *Y, int incY) nogil
+    void sgemv "cblas_sgemv"(CBLAS_ORDER Order,
+                      CBLAS_TRANSPOSE TransA, int M, int N,
+                      float alpha, float *A, int lda,
+                      float *X, int incX, float beta,
+                      float *Y, int incY) nogil
     double dnrm2 "cblas_dnrm2"(int N, double *X, int incX) nogil
+    float snrm2 "cblas_snrm2"(int N, float *X, int incX) nogil
     void dcopy "cblas_dcopy"(int N, double *X, int incX, double *Y, int incY) nogil
+    void scopy "cblas_scopy"(int N, float *X, int incX, float *Y, int incY) nogil
     void dscal "cblas_dscal"(int N, double alpha, double *X, int incX) nogil
+    void sscal "cblas_sscal"(int N, float alpha, float *X, int incX) nogil
 
 
 @cython.boundscheck(False)

From d204deab6945764f4db46a6294947e4889772909 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Wed, 22 Jun 2016 01:31:29 +0800
Subject: [PATCH 03/50] Make enet_coordinate_descent support fused types

---
 sklearn/linear_model/cd_fast.pyx           | 354 ++++++++++++++-------
 sklearn/linear_model/coordinate_descent.py |  22 +-
 2 files changed, 245 insertions(+), 131 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 74bd89598b537..12a8117e6a117 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -139,11 +139,11 @@ cdef extern from "cblas.h":
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
-def enet_coordinate_descent(np.ndarray[DOUBLE, ndim=1] w,
-                            double alpha, double beta,
-                            np.ndarray[DOUBLE, ndim=2, mode='fortran'] X,
-                            np.ndarray[DOUBLE, ndim=1, mode='c'] y,
-                            int max_iter, double tol,
+def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
+                            floating alpha, floating beta,
+                            np.ndarray[floating, ndim=2, mode='fortran'] X,
+                            np.ndarray[floating, ndim=1, mode='c'] y,
+                            int max_iter, floating tol,
                             object rng, bint random=0, bint positive=0):
     """Cython version of the coordinate descent algorithm
         for Elastic-Net regression
@@ -159,26 +159,34 @@ def enet_coordinate_descent(np.ndarray[DOUBLE, ndim=1] w,
     cdef unsigned int n_features = X.shape[1]
 
     # get the number of tasks indirectly, using strides
-    cdef unsigned int n_tasks = y.strides[0] / sizeof(DOUBLE)
+    cdef unsigned int n_tasks = y.strides[0] / sizeof(floating)
 
     # compute norms of the columns of X
-    cdef np.ndarray[DOUBLE, ndim=1] norm_cols_X = (X**2).sum(axis=0)
+    cdef np.ndarray[floating, ndim=1] norm_cols_X = (X**2).sum(axis=0)
 
     # initial value of the residuals
-    cdef np.ndarray[DOUBLE, ndim=1] R = np.empty(n_samples)
+    cdef np.ndarray[floating, ndim=1] R
 
-    cdef np.ndarray[DOUBLE, ndim=1] XtA = np.empty(n_features)
-    cdef double tmp
-    cdef double w_ii
-    cdef double d_w_max
-    cdef double w_max
-    cdef double d_w_ii
-    cdef double gap = tol + 1.0
-    cdef double d_w_tol = tol
-    cdef double dual_norm_XtA
-    cdef double R_norm2
-    cdef double w_norm2
-    cdef double l1_norm
+    cdef np.ndarray[floating, ndim=1] XtA
+
+    if floating is float:
+        R = np.empty(n_samples, dtype=np.float32)
+        XtA = np.empty(n_features, dtype=np.float32)
+    else:
+        R = np.empty(n_samples)
+        XtA = np.empty(n_features)
+
+    cdef floating tmp
+    cdef floating w_ii
+    cdef floating d_w_max
+    cdef floating w_max
+    cdef floating d_w_ii
+    cdef floating gap = tol + 1.0
+    cdef floating d_w_tol = tol
+    cdef floating dual_norm_XtA
+    cdef floating R_norm2
+    cdef floating w_norm2
+    cdef floating l1_norm
     cdef unsigned int ii
     cdef unsigned int i
     cdef unsigned int n_iter = 0
@@ -191,108 +199,212 @@ def enet_coordinate_descent(np.ndarray[DOUBLE, ndim=1] w,
             " results and is discouraged.")
 
     with nogil:
-        # R = y - np.dot(X, w)
-        for i in range(n_samples):
-            R[i] = y[i] - ddot(n_features,
-                               <DOUBLE*>(X.data + i * sizeof(DOUBLE)),
-                               n_samples, <DOUBLE*>w.data, 1)
-
-        # tol *= np.dot(y, y)
-        tol *= ddot(n_samples, <DOUBLE*>y.data, n_tasks,
-                    <DOUBLE*>y.data, n_tasks)
-
-        for n_iter in range(max_iter):
-            w_max = 0.0
-            d_w_max = 0.0
-            for f_iter in range(n_features):  # Loop over coordinates
-                if random:
-                    ii = rand_int(n_features, rand_r_state)
-                else:
-                    ii = f_iter
-
-                if norm_cols_X[ii] == 0.0:
-                    continue
-
-                w_ii = w[ii]  # Store previous value
-
-                if w_ii != 0.0:
-                    # R += w_ii * X[:,ii]
-                    daxpy(n_samples, w_ii,
-                          <DOUBLE*>(X.data + ii * n_samples * sizeof(DOUBLE)),
-                          1, <DOUBLE*>R.data, 1)
-
-                # tmp = (X[:,ii]*R).sum()
-                tmp = ddot(n_samples,
-                           <DOUBLE*>(X.data + ii * n_samples * sizeof(DOUBLE)),
-                           1, <DOUBLE*>R.data, 1)
-
-                if positive and tmp < 0:
-                    w[ii] = 0.0
-                else:
-                    w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0)
-                             / (norm_cols_X[ii] + beta))
-
-                if w[ii] != 0.0:
-                    # R -=  w[ii] * X[:,ii] # Update residual
-                    daxpy(n_samples, -w[ii],
-                          <DOUBLE*>(X.data + ii * n_samples * sizeof(DOUBLE)),
-                          1, <DOUBLE*>R.data, 1)
-
-                # update the maximum absolute coefficient update
-                d_w_ii = fabs(w[ii] - w_ii)
-                if d_w_ii > d_w_max:
-                    d_w_max = d_w_ii
-
-                if fabs(w[ii]) > w_max:
-                    w_max = fabs(w[ii])
-
-            if (w_max == 0.0
-                    or d_w_max / w_max < d_w_tol
-                    or n_iter == max_iter - 1):
-                # the biggest coordinate update of this iteration was smaller
-                # than the tolerance: check the duality gap as ultimate
-                # stopping criterion
-
-                # XtA = np.dot(X.T, R) - beta * w
-                for i in range(n_features):
-                    XtA[i] = ddot(
-                        n_samples,
-                        <DOUBLE*>(X.data + i * n_samples *sizeof(DOUBLE)),
-                        1, <DOUBLE*>R.data, 1) - beta * w[i]
-
-                if positive:
-                    dual_norm_XtA = max(n_features, <DOUBLE*>XtA.data)
-                else:
-                    dual_norm_XtA = abs_max(n_features, <DOUBLE*>XtA.data)
-
-                # R_norm2 = np.dot(R, R)
-                R_norm2 = ddot(n_samples, <DOUBLE*>R.data, 1,
-                               <DOUBLE*>R.data, 1)
-
-                # w_norm2 = np.dot(w, w)
-                w_norm2 = ddot(n_features, <DOUBLE*>w.data, 1,
-                               <DOUBLE*>w.data, 1)
-
-                if (dual_norm_XtA > alpha):
-                    const = alpha / dual_norm_XtA
-                    A_norm2 = R_norm2 * (const ** 2)
-                    gap = 0.5 * (R_norm2 + A_norm2)
-                else:
-                    const = 1.0
-                    gap = R_norm2
-
-                l1_norm = dasum(n_features, <DOUBLE*>w.data, 1)
-
-                # np.dot(R.T, y)
-                gap += (alpha * l1_norm - const * ddot(
+        if floating is double:
+            # R = y - np.dot(X, w)
+            for i in range(n_samples):
+                R[i] = y[i] - ddot(n_features,
+                                <DOUBLE*>(X.data + i * sizeof(DOUBLE)),
+                                n_samples, <DOUBLE*>w.data, 1)
+
+            # tol *= np.dot(y, y)
+            tol *= ddot(n_samples, <DOUBLE*>y.data, n_tasks,
+                        <DOUBLE*>y.data, n_tasks)
+
+            for n_iter in range(max_iter):
+                w_max = 0.0
+                d_w_max = 0.0
+                for f_iter in range(n_features):  # Loop over coordinates
+                    if random:
+                        ii = rand_int(n_features, rand_r_state)
+                    else:
+                        ii = f_iter
+
+                    if norm_cols_X[ii] == 0.0:
+                        continue
+
+                    w_ii = w[ii]  # Store previous value
+
+                    if w_ii != 0.0:
+                        # R += w_ii * X[:,ii]
+                        daxpy(n_samples, w_ii,
+                            <DOUBLE*>(X.data + ii * n_samples * sizeof(DOUBLE)),
+                            1, <DOUBLE*>R.data, 1)
+
+                    # tmp = (X[:,ii]*R).sum()
+                    tmp = ddot(n_samples,
+                            <DOUBLE*>(X.data + ii * n_samples * sizeof(DOUBLE)),
+                            1, <DOUBLE*>R.data, 1)
+
+                    if positive and tmp < 0:
+                        w[ii] = 0.0
+                    else:
+                        w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0)
+                                / (norm_cols_X[ii] + beta))
+
+                    if w[ii] != 0.0:
+                        # R -=  w[ii] * X[:,ii] # Update residual
+                        daxpy(n_samples, -w[ii],
+                            <DOUBLE*>(X.data + ii * n_samples * sizeof(DOUBLE)),
+                            1, <DOUBLE*>R.data, 1)
+
+                    # update the maximum absolute coefficient update
+                    d_w_ii = fabs(w[ii] - w_ii)
+                    if d_w_ii > d_w_max:
+                        d_w_max = d_w_ii
+
+                    if fabs(w[ii]) > w_max:
+                        w_max = fabs(w[ii])
+
+                if (w_max == 0.0
+                        or d_w_max / w_max < d_w_tol
+                        or n_iter == max_iter - 1):
+                    # the biggest coordinate update of this iteration was smaller
+                    # than the tolerance: check the duality gap as ultimate
+                    # stopping criterion
+
+                    # XtA = np.dot(X.T, R) - beta * w
+                    for i in range(n_features):
+                        XtA[i] = ddot(
                             n_samples,
-                            <DOUBLE*>R.data, 1,
-                            <DOUBLE*>y.data, n_tasks)
-                        + 0.5 * beta * (1 + const ** 2) * (w_norm2))
-
-                if gap < tol:
-                    # return if we reached desired tolerance
-                    break
+                            <DOUBLE*>(X.data + i * n_samples *sizeof(DOUBLE)),
+                            1, <DOUBLE*>R.data, 1) - beta * w[i]
+
+                    if positive:
+                        dual_norm_XtA = max(n_features, <DOUBLE*>XtA.data)
+                    else:
+                        dual_norm_XtA = abs_max(n_features, <DOUBLE*>XtA.data)
+
+                    # R_norm2 = np.dot(R, R)
+                    R_norm2 = ddot(n_samples, <DOUBLE*>R.data, 1,
+                                <DOUBLE*>R.data, 1)
+
+                    # w_norm2 = np.dot(w, w)
+                    w_norm2 = ddot(n_features, <DOUBLE*>w.data, 1,
+                                <DOUBLE*>w.data, 1)
+
+                    if (dual_norm_XtA > alpha):
+                        const = alpha / dual_norm_XtA
+                        A_norm2 = R_norm2 * (const ** 2)
+                        gap = 0.5 * (R_norm2 + A_norm2)
+                    else:
+                        const = 1.0
+                        gap = R_norm2
+
+                    l1_norm = dasum(n_features, <DOUBLE*>w.data, 1)
+
+                    # np.dot(R.T, y)
+                    gap += (alpha * l1_norm - const * ddot(
+                                n_samples,
+                                <DOUBLE*>R.data, 1,
+                                <DOUBLE*>y.data, n_tasks)
+                            + 0.5 * beta * (1 + const ** 2) * (w_norm2))
+
+                    if gap < tol:
+                        # return if we reached desired tolerance
+                        break
+        else:
+            # R = y - np.dot(X, w)
+            for i in range(n_samples):
+                R[i] = y[i] - sdot(n_features,
+                                <float*>(X.data + i * sizeof(float)),
+                                n_samples, <float*>w.data, 1)
+
+            # tol *= np.dot(y, y)
+            tol *= sdot(n_samples, <float*>y.data, n_tasks,
+                        <float*>y.data, n_tasks)
+
+            for n_iter in range(max_iter):
+                w_max = 0.0
+                d_w_max = 0.0
+                for f_iter in range(n_features):  # Loop over coordinates
+                    if random:
+                        ii = rand_int(n_features, rand_r_state)
+                    else:
+                        ii = f_iter
+
+                    if norm_cols_X[ii] == 0.0:
+                        continue
+
+                    w_ii = w[ii]  # Store previous value
+
+                    if w_ii != 0.0:
+                        # R += w_ii * X[:,ii]
+                        saxpy(n_samples, w_ii,
+                            <float*>(X.data + ii * n_samples * sizeof(float)),
+                            1, <float*>R.data, 1)
+
+                    # tmp = (X[:,ii]*R).sum()
+                    tmp = sdot(n_samples,
+                            <float*>(X.data + ii * n_samples * sizeof(float)),
+                            1, <float*>R.data, 1)
+
+                    if positive and tmp < 0:
+                        w[ii] = 0.0
+                    else:
+                        w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0)
+                                / (norm_cols_X[ii] + beta))
+
+                    if w[ii] != 0.0:
+                        # R -=  w[ii] * X[:,ii] # Update residual
+                        saxpy(n_samples, -w[ii],
+                            <float*>(X.data + ii * n_samples * sizeof(float)),
+                            1, <float*>R.data, 1)
+
+                    # update the maximum absolute coefficient update
+                    d_w_ii = fabs(w[ii] - w_ii)
+                    if d_w_ii > d_w_max:
+                        d_w_max = d_w_ii
+
+                    if fabs(w[ii]) > w_max:
+                        w_max = fabs(w[ii])
+
+                if (w_max == 0.0
+                        or d_w_max / w_max < d_w_tol
+                        or n_iter == max_iter - 1):
+                    # the biggest coordinate update of this iteration was smaller
+                    # than the tolerance: check the duality gap as ultimate
+                    # stopping criterion
+
+                    # XtA = np.dot(X.T, R) - beta * w
+                    for i in range(n_features):
+                        XtA[i] = sdot(
+                            n_samples,
+                            <float*>(X.data + i * n_samples *sizeof(float)),
+                            1, <float*>R.data, 1) - beta * w[i]
+
+                    if positive:
+                        dual_norm_XtA = max(n_features, <float*>XtA.data)
+                    else:
+                        dual_norm_XtA = abs_max(n_features, <float*>XtA.data)
+
+                    # R_norm2 = np.dot(R, R)
+                    R_norm2 = sdot(n_samples, <float*>R.data, 1,
+                                <float*>R.data, 1)
+
+                    # w_norm2 = np.dot(w, w)
+                    w_norm2 = sdot(n_features, <float*>w.data, 1,
+                                <float*>w.data, 1)
+
+                    if (dual_norm_XtA > alpha):
+                        const = alpha / dual_norm_XtA
+                        A_norm2 = R_norm2 * (const ** 2)
+                        gap = 0.5 * (R_norm2 + A_norm2)
+                    else:
+                        const = 1.0
+                        gap = R_norm2
+
+                    l1_norm = sasum(n_features, <float*>w.data, 1)
+
+                    # np.dot(R.T, y)
+                    gap += (alpha * l1_norm - const * sdot(
+                                n_samples,
+                                <float*>R.data, 1,
+                                <float*>y.data, n_tasks)
+                            + 0.5 * beta * (1 + const ** 2) * (w_norm2))
+
+                    if gap < tol:
+                        # return if we reached desired tolerance
+                        break
 
     return w, gap, tol, n_iter + 1
 
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 038a9830c59f3..b3f00a8a2e28c 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -375,12 +375,12 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     # We expect X and y to be already float64 Fortran ordered when bypassing
     # checks
     if check_input:
-        X = check_array(X, 'csc', dtype=np.float64, order='F', copy=copy_X)
-        y = check_array(y, 'csc', dtype=np.float64, order='F', copy=False,
+        X = check_array(X, 'csc', order='F', copy=copy_X)
+        y = check_array(y, 'csc', order='F', copy=False,
                         ensure_2d=False)
         if Xy is not None:
             # Xy should be a 1d contiguous array or a 2D C ordered array
-            Xy = check_array(Xy, dtype=np.float64, order='C', copy=False,
+            Xy = check_array(Xy, order='C', copy=False,
                              ensure_2d=False)
     n_samples, n_features = X.shape
 
@@ -426,10 +426,10 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     random = (selection == 'random')
 
     if not multi_output:
-        coefs = np.empty((n_features, n_alphas), dtype=np.float64)
+        coefs = np.empty((n_features, n_alphas), dtype=X.dtype)
     else:
         coefs = np.empty((n_outputs, n_features, n_alphas),
-                         dtype=np.float64)
+                         dtype=X.dtype)
 
     if coef_init is None:
         coef_ = np.asfortranarray(np.zeros(coefs.shape[:-1]))
@@ -457,6 +457,7 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
                 coef_, l1_reg, l2_reg, precompute, Xy, y, max_iter,
                 tol, rng, random, positive)
         elif precompute is False:
+            print "model: enet_coordinate_descent"
             model = cd_fast.enet_coordinate_descent(
                 coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random,
                 positive)
@@ -654,6 +655,7 @@ def fit(self, X, y, check_input=True):
         initial data in memory directly using that format.
         """
 
+        print "test test"
         if self.alpha == 0:
             warnings.warn("With alpha=0, this algorithm does not converge "
                           "well. You are advised to use the LinearRegression "
@@ -666,12 +668,12 @@ def fit(self, X, y, check_input=True):
         # We expect X and y to be already float64 Fortran ordered arrays
         # when bypassing checks
         if check_input:
-            y = np.asarray(y, dtype=np.float64)
-            X, y = check_X_y(X, y, accept_sparse='csc', dtype=np.float64,
+            y = np.asarray(y)
+            X, y = check_X_y(X, y, accept_sparse='csc',
                              order='F',
                              copy=self.copy_X and self.fit_intercept,
                              multi_output=True, y_numeric=True)
-            y = check_array(y, dtype=np.float64, order='F', copy=False,
+            y = check_array(y, order='F', copy=False,
                             ensure_2d=False)
         X, y, X_offset, y_offset, X_scale, precompute, Xy = \
             _pre_fit(X, y, None, self.precompute, self.normalize,
@@ -688,14 +690,14 @@ def fit(self, X, y, check_input=True):
             raise ValueError("selection should be either random or cyclic.")
 
         if not self.warm_start or self.coef_ is None:
-            coef_ = np.zeros((n_targets, n_features), dtype=np.float64,
+            coef_ = np.zeros((n_targets, n_features), dtype=X.dtype,
                              order='F')
         else:
             coef_ = self.coef_
             if coef_.ndim == 1:
                 coef_ = coef_[np.newaxis, :]
 
-        dual_gaps_ = np.zeros(n_targets, dtype=np.float64)
+        dual_gaps_ = np.zeros(n_targets, dtype=X.dtype)
         self.n_iter_ = []
 
         for k in xrange(n_targets):

From c9aa51ec0983c93b6af19e4f21ff2262d273611d Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Sat, 2 Jul 2016 20:47:48 +0800
Subject: [PATCH 04/50] Make dense case work

---
 sklearn/linear_model/base.py               |  6 ++-
 sklearn/linear_model/coordinate_descent.py | 54 ++++++++++++++--------
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index f713593741726..fcbec89811ec2 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -556,9 +556,11 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
         # copy was done in fit if necessary
         X, y, X_offset, y_offset, X_scale = _preprocess_data(
             X, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy)
+        X_scale = np.asarray(X_scale, dtype=X.dtype)
+
     if hasattr(precompute, '__array__') and (
-            fit_intercept and not np.allclose(X_offset, np.zeros(n_features)) or
-            normalize and not np.allclose(X_scale, np.ones(n_features))):
+            fit_intercept and not np.allclose(X_offset, np.zeros(n_features))
+            or normalize and not np.allclose(X_scale, np.ones(n_features))):
         warnings.warn("Gram matrix was provided but X was centered"
                       " to fit intercept, "
                       "or X was normalized : recomputing Gram matrix.",
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index b3f00a8a2e28c..b7fbad8387095 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -1,4 +1,4 @@
-# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+#` Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 #         Fabian Pedregosa <fabian.pedregosa@inria.fr>
 #         Olivier Grisel <olivier.grisel@ensta.org>
 #         Gael Varoquaux <gael.varoquaux@inria.fr>
@@ -375,13 +375,23 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     # We expect X and y to be already float64 Fortran ordered when bypassing
     # checks
     if check_input:
-        X = check_array(X, 'csc', order='F', copy=copy_X)
-        y = check_array(y, 'csc', order='F', copy=False,
+        if X.dtype is np.float32:
+            X = check_array(X, 'csc', dtype=np.float32, order='F', copy=copy_X)
+            y = check_array(y, 'csc', dtype=np.float32, order='F', copy=False,
                         ensure_2d=False)
-        if Xy is not None:
-            # Xy should be a 1d contiguous array or a 2D C ordered array
-            Xy = check_array(Xy, order='C', copy=False,
-                             ensure_2d=False)
+            if Xy is not None:
+                # Xy should be a 1d contiguous array or a 2D C ordered array
+                Xy = check_array(Xy, dtype=np.float32, order='C', copy=False,
+                                 ensure_2d=False)
+        else:
+            X = check_array(X, 'csc', dtype=np.float64, order='F', copy=copy_X)
+            y = check_array(y, 'csc', dtype=np.float64, order='F', copy=False,
+                            ensure_2d=False)
+            if Xy is not None:
+                # Xy should be a 1d contiguous array or a 2D C ordered array
+                Xy = check_array(Xy, dtype=np.float64, order='C', copy=False,
+                                 ensure_2d=False)
+
     n_samples, n_features = X.shape
 
     multi_output = False
@@ -457,7 +467,6 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
                 coef_, l1_reg, l2_reg, precompute, Xy, y, max_iter,
                 tol, rng, random, positive)
         elif precompute is False:
-            print "model: enet_coordinate_descent"
             model = cd_fast.enet_coordinate_descent(
                 coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random,
                 positive)
@@ -655,7 +664,6 @@ def fit(self, X, y, check_input=True):
         initial data in memory directly using that format.
         """
 
-        print "test test"
         if self.alpha == 0:
             warnings.warn("With alpha=0, this algorithm does not converge "
                           "well. You are advised to use the LinearRegression "
@@ -668,16 +676,27 @@ def fit(self, X, y, check_input=True):
         # We expect X and y to be already float64 Fortran ordered arrays
         # when bypassing checks
         if check_input:
-            y = np.asarray(y)
-            X, y = check_X_y(X, y, accept_sparse='csc',
-                             order='F',
-                             copy=self.copy_X and self.fit_intercept,
-                             multi_output=True, y_numeric=True)
-            y = check_array(y, order='F', copy=False,
-                            ensure_2d=False)
+            if sparse.issparse(X):
+                y = np.asarray(y, dtype=np.float64)
+                X, y = check_X_y(X, y, accept_sparse='csc',
+                                 order='F', dtype=np.float64,
+                                 copy=self.copy_X and self.fit_intercept,
+                                 multi_output=True, y_numeric=True)
+                y = check_array(y, order='F', copy=False, dtype=np.float64,
+                                ensure_2d=False)
+            else:
+                y = np.asarray(y)
+                X, y = check_X_y(X, y, accept_sparse='csc',
+                                 order='F', dtype=[np.float64, np.float32],
+                                 copy=self.copy_X and self.fit_intercept,
+                                 multi_output=True, y_numeric=True)
+                y = check_array(y, order='F', copy=False, dtype=X.dtype.type,
+                                ensure_2d=False)
+
         X, y, X_offset, y_offset, X_scale, precompute, Xy = \
             _pre_fit(X, y, None, self.precompute, self.normalize,
                      self.fit_intercept, copy=False)
+
         if y.ndim == 1:
             y = y[:, np.newaxis]
         if Xy is not None and Xy.ndim == 1:
@@ -699,7 +718,6 @@ def fit(self, X, y, check_input=True):
 
         dual_gaps_ = np.zeros(n_targets, dtype=X.dtype)
         self.n_iter_ = []
-
         for k in xrange(n_targets):
             if Xy is not None:
                 this_Xy = Xy[:, k]
@@ -725,8 +743,8 @@ def fit(self, X, y, check_input=True):
             self.n_iter_ = self.n_iter_[0]
 
         self.coef_, self.dual_gap_ = map(np.squeeze, [coef_, dual_gaps_])
-        self._set_intercept(X_offset, y_offset, X_scale)
 
+        self._set_intercept(X_offset, y_offset, X_scale)
         # return self for chaining fit and predict calls
         return self
 

From 4540ebc338e187daf359ab1e0f51046472d91653 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Sat, 2 Jul 2016 22:09:00 +0800
Subject: [PATCH 05/50] Refactor format

---
 sklearn/linear_model/coordinate_descent.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index b7fbad8387095..dde9e18124598 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -1,4 +1,4 @@
-#` Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
+# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
 #         Fabian Pedregosa <fabian.pedregosa@inria.fr>
 #         Olivier Grisel <olivier.grisel@ensta.org>
 #         Gael Varoquaux <gael.varoquaux@inria.fr>
@@ -696,7 +696,6 @@ def fit(self, X, y, check_input=True):
         X, y, X_offset, y_offset, X_scale, precompute, Xy = \
             _pre_fit(X, y, None, self.precompute, self.normalize,
                      self.fit_intercept, copy=False)
-
         if y.ndim == 1:
             y = y[:, np.newaxis]
         if Xy is not None and Xy.ndim == 1:
@@ -718,6 +717,7 @@ def fit(self, X, y, check_input=True):
 
         dual_gaps_ = np.zeros(n_targets, dtype=X.dtype)
         self.n_iter_ = []
+
         for k in xrange(n_targets):
             if Xy is not None:
                 this_Xy = Xy[:, k]
@@ -743,8 +743,8 @@ def fit(self, X, y, check_input=True):
             self.n_iter_ = self.n_iter_[0]
 
         self.coef_, self.dual_gap_ = map(np.squeeze, [coef_, dual_gaps_])
-
         self._set_intercept(X_offset, y_offset, X_scale)
+
         # return self for chaining fit and predict calls
         return self
 

From 4c1829c53f92ae27e5f5c5810aa50f7ffce2382b Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Sun, 3 Jul 2016 00:36:21 +0800
Subject: [PATCH 06/50] Remove redundant change

---
 sklearn/linear_model/base.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index fcbec89811ec2..309b722a4eff9 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -556,8 +556,6 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
         # copy was done in fit if necessary
         X, y, X_offset, y_offset, X_scale = _preprocess_data(
             X, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy)
-        X_scale = np.asarray(X_scale, dtype=X.dtype)
-
     if hasattr(precompute, '__array__') and (
             fit_intercept and not np.allclose(X_offset, np.zeros(n_features))
             or normalize and not np.allclose(X_scale, np.ones(n_features))):

From efdda458c0211c263658052d31e94ea96d6dba0c Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Wed, 6 Jul 2016 10:10:04 +0800
Subject: [PATCH 07/50] Add cblas files

---
 sklearn/src/cblas/cblas_sasum.c |  44 ++++++++++++++
 sklearn/src/cblas/cblas_saxpy.c |  52 ++++++++++++++++
 sklearn/src/cblas/cblas_sgemv.c | 102 ++++++++++++++++++++++++++++++++
 sklearn/src/cblas/cblas_sger.c  |  85 ++++++++++++++++++++++++++
 sklearn/src/cblas/cblas_sscal.c |  43 ++++++++++++++
 5 files changed, 326 insertions(+)
 create mode 100644 sklearn/src/cblas/cblas_sasum.c
 create mode 100644 sklearn/src/cblas/cblas_saxpy.c
 create mode 100644 sklearn/src/cblas/cblas_sgemv.c
 create mode 100644 sklearn/src/cblas/cblas_sger.c
 create mode 100644 sklearn/src/cblas/cblas_sscal.c

diff --git a/sklearn/src/cblas/cblas_sasum.c b/sklearn/src/cblas/cblas_sasum.c
new file mode 100644
index 0000000000000..439707ba021f4
--- /dev/null
+++ b/sklearn/src/cblas/cblas_sasum.c
@@ -0,0 +1,44 @@
+/*
+ *             Automatically Tuned Linear Algebra Software v3.10.2
+ *                    (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions, and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *   3. The name of the ATLAS group or the names of its contributers may
+ *      not be used to endorse or promote products derived from this
+ *      software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define SREAL
+#include "atlas_misc.h"
+#ifdef ATL_USEPTHREADS
+   #include "atlas_ptalias1.h"
+#endif
+#include "atlas_level1.h"
+#include "cblas.h"
+
+float cblas_sasum(const int N, const float *X, const int incX)
+{  /* CBLAS entry point: forwards to ATL_sasum when N and incX are positive. */
+   if (N > 0 && incX > 0)
+      return(ATL_sasum(N, X, incX));
+   return(0.0f);  /* N <= 0 or incX <= 0: ATL_sasum is skipped, result is 0 */
+}
diff --git a/sklearn/src/cblas/cblas_saxpy.c b/sklearn/src/cblas/cblas_saxpy.c
new file mode 100644
index 0000000000000..911c17d6b02c6
--- /dev/null
+++ b/sklearn/src/cblas/cblas_saxpy.c
@@ -0,0 +1,52 @@
+/*
+ *             Automatically Tuned Linear Algebra Software v3.10.2
+ *                    (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions, and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *   3. The name of the ATLAS group or the names of its contributers may
+ *      not be used to endorse or promote products derived from this
+ *      software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define SREAL
+#include "atlas_misc.h"
+#ifdef ATL_USEPTHREADS
+   #include "atlas_ptalias1.h"
+#endif
+#include "atlas_level1.h"
+#include "cblas.h"
+
+void cblas_saxpy(const int N, const float alpha, const float *X,
+                 const int incX, float *Y, const int incY)
+{  /* CBLAS entry point: forwards to ATL_saxpy; does nothing unless N > 0. */
+   if (N > 0)
+   {
+      if (incX < 0)
+      {  /* incX < 0: negate both strides, or offset X by (1-N)*incX */
+         if (incY < 0) ATL_saxpy(N, alpha, X, -incX, Y, -incY);
+         else ATL_saxpy(N, alpha, X+(1-N)*incX, incX, Y, incY);
+      }  /* incX >= 0 below; incY < 0 offsets X by (N-1)*incX, negates strides */
+      else if (incY < 0) ATL_saxpy(N, alpha, X+(N-1)*incX, -incX, Y, -incY);
+      else ATL_saxpy(N, alpha, X, incX, Y, incY);
+   }
+}
diff --git a/sklearn/src/cblas/cblas_sgemv.c b/sklearn/src/cblas/cblas_sgemv.c
new file mode 100644
index 0000000000000..2a2f09730baeb
--- /dev/null
+++ b/sklearn/src/cblas/cblas_sgemv.c
@@ -0,0 +1,102 @@
+/*
+ *             Automatically Tuned Linear Algebra Software v3.10.2
+ *                    (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions, and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *   3. The name of the ATLAS group or the names of its contributers may
+ *      not be used to endorse or promote products derived from this
+ *      software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define SREAL
+#include "atlas_misc.h"
+#include "cblas.h"
+#ifdef ATL_USEPTHREADS
+   #include "atlas_ptalias2.h"
+#endif
+#include "atlas_level2.h"
+
+void cblas_sgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TA,
+                 const int M, const int N, const float alpha, const float *A,
+                 const int lda, const float *X, const int incX,
+                 const float beta, float *Y, const int incY)
+{
+   int info = 2000;
+   #define x X
+   #define y Y
+
+#ifndef NoCblasErrorChecks
+   if (TA != CblasNoTrans && TA != CblasTrans && TA != CblasConjTrans)
+      info = cblas_errprn(2, info,
+                          "TransA must be %d, %d or %d, but is set to %d",
+                          CblasNoTrans, CblasTrans, CblasConjTrans, TA);
+
+   if (M < 0) info = cblas_errprn(3, info,
+                        "M cannot be less than zero; is set to %d.", M);
+   if (N < 0) info = cblas_errprn(4, info,
+                        "N cannot be less than zero; is set to %d.", N);
+   if (!incX) info = cblas_errprn(9, info,
+                                  "incX cannot be zero; is set to %d.", incX);
+   if (!incY) info = cblas_errprn(12, info,
+                                  "incY cannot be zero; is set to %d.", incY);
+   if (Order == CblasColMajor)
+   {
+      if (lda < M || lda < 1)
+         info = cblas_errprn(7, info, "lda must be >= MAX(M,1): lda=%d M=%d",
+                             lda, M);
+   }
+   else if (Order == CblasRowMajor)
+   {
+      if (lda < N || lda < 1)
+         info = cblas_errprn(7, info, "lda must be >= MAX(N,1): lda=%d N=%d",
+                             lda, N);
+   }
+   else
+      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
+                          CblasRowMajor, CblasColMajor, Order);
+   if (info != 2000)
+   {
+      cblas_xerbla(info, "cblas_sgemv", "");
+      return;
+   }
+#endif
+   if (TA == AtlasNoTrans)
+   {
+      if (incX < 0) x += (1-N)*incX;
+      if (incY < 0) y += (1-M)*incY;
+   }
+   else
+   {
+      if (incX < 0) x += (1-M)*incX;
+      if (incY < 0) y += (1-N)*incY;
+   }
+   if (Order == CblasColMajor)
+      ATL_sgemv(TA, M, N, alpha, A, lda, x, incX, beta, y, incY);
+   else
+   {
+      if (TA == CblasNoTrans)
+         ATL_sgemv(CblasTrans, N, M, alpha, A, lda, x, incX, beta, y, incY);
+      else
+         ATL_sgemv(CblasNoTrans, N, M, alpha, A, lda, x, incX, beta, y, incY);
+   }
+}
diff --git a/sklearn/src/cblas/cblas_sger.c b/sklearn/src/cblas/cblas_sger.c
new file mode 100644
index 0000000000000..7cee9a7432faa
--- /dev/null
+++ b/sklearn/src/cblas/cblas_sger.c
@@ -0,0 +1,85 @@
+/*
+ *             Automatically Tuned Linear Algebra Software v3.10.2
+ *                    (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions, and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *   3. The name of the ATLAS group or the names of its contributers may
+ *      not be used to endorse or promote products derived from this
+ *      software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define SREAL
+#include "atlas_misc.h"
+#include "cblas.h"
+#ifdef ATL_USEPTHREADS
+   #include "atlas_ptalias2.h"
+#endif
+#include "atlas_level2.h"
+
+void cblas_sger (const enum CBLAS_ORDER Order, const int M, const int N,
+                 const float alpha, const float *X, const int incX,
+                 const float *Y, const int incY, float *A, const int lda)
+{
+   int info = 2000;
+   #define x X
+   #define y Y
+
+#ifndef NoCblasErrorChecks
+   if (M < 0) info = cblas_errprn(2, info,
+                        "M cannot be less than zero; is set to %d.", M);
+   if (N < 0) info = cblas_errprn(3, info,
+                        "N cannot be less than zero; is set to %d.", N);
+   if (!incX) info = cblas_errprn(6, info,
+                                  "incX cannot be zero; is set to %d.", incX);
+   if (!incY) info = cblas_errprn(8, info,
+                                  "incY cannot be zero; is set to %d.", incY);
+   if (Order == CblasColMajor)
+   {
+      if (lda < M || lda < 1)
+         info = cblas_errprn(10, info, "lda must be >= MAX(M,1): lda=%d M=%d",
+                             lda, M);
+   }
+   else if (Order == CblasRowMajor)
+   {
+      if (lda < N || lda < 1)
+         info = cblas_errprn(10, info, "lda must be >= MAX(N,1): lda=%d M=%d",
+                             lda, N);
+   }
+   else
+      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
+                          CblasRowMajor, CblasColMajor, Order);
+   if (info != 2000)
+   {
+      cblas_xerbla(info, "cblas_sger", "");
+      return;
+   }
+#endif
+
+   if (incX < 0) x += (1-M)*incX;
+   if (incY < 0) y += (1-N)*incY;
+
+   if (Order == CblasColMajor)
+      ATL_sger(M, N, alpha, x, incX, y, incY, A, lda);
+   else
+      ATL_sger(N, M, alpha, y, incY, x, incX, A, lda);
+}
diff --git a/sklearn/src/cblas/cblas_sscal.c b/sklearn/src/cblas/cblas_sscal.c
new file mode 100644
index 0000000000000..ea06c2be66439
--- /dev/null
+++ b/sklearn/src/cblas/cblas_sscal.c
@@ -0,0 +1,43 @@
+/*
+ *             Automatically Tuned Linear Algebra Software v3.10.2
+ *                    (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions, and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *   3. The name of the ATLAS group or the names of its contributers may
+ *      not be used to endorse or promote products derived from this
+ *      software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define SREAL
+#include "atlas_misc.h"
+#ifdef ATL_USEPTHREADS
+   #include "atlas_ptalias1.h"
+#endif
+#include "atlas_level1.h"
+#include "cblas.h"
+
+void cblas_sscal(const int N, const float alpha, float *X, const int incX)
+{
+   if (N > 0 && incX > 0)
+      ATL_sscal(N, alpha, X, incX);
+}

From a5249b646c4defd4f7a412701d4da83997e37d7f Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Thu, 7 Jul 2016 22:38:26 +0800
Subject: [PATCH 08/50] Avoid redundant code

---
 sklearn/linear_model/cd_fast.pyx | 323 +++++++++++--------------------
 1 file changed, 118 insertions(+), 205 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 12a8117e6a117..4143ce4963aa0 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -18,6 +18,11 @@ import warnings
 
 ctypedef np.float64_t DOUBLE
 ctypedef np.uint32_t UINT32_t
+ctypedef floating (*DOT)(int N, floating *X, int incX, floating *Y,
+                         int incY) nogil
+ctypedef void (*AXPY)(int N, floating alpha, floating *X, int incX,
+                          floating *Y, int incY) nogil
+ctypedef floating (*ASUM)(int N, floating *X, int incX) nogil
 
 np.import_array()
 
@@ -169,12 +174,23 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
 
     cdef np.ndarray[floating, ndim=1] XtA
 
+    # fused types version of BLAS functions
+    cdef DOT dot
+    cdef AXPY axpy
+    cdef ASUM asum
+
     if floating is float:
         R = np.empty(n_samples, dtype=np.float32)
         XtA = np.empty(n_features, dtype=np.float32)
+        dot = sdot
+        axpy = saxpy
+        asum = sasum
     else:
         R = np.empty(n_samples)
         XtA = np.empty(n_features)
+        dot = ddot
+        axpy = daxpy
+        asum = dasum
 
     cdef floating tmp
     cdef floating w_ii
@@ -187,6 +203,8 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
     cdef floating R_norm2
     cdef floating w_norm2
     cdef floating l1_norm
+    cdef floating const
+    cdef floating A_norm2
     cdef unsigned int ii
     cdef unsigned int i
     cdef unsigned int n_iter = 0
@@ -199,213 +217,108 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
             " results and is discouraged.")
 
     with nogil:
-        if floating is double:
-            # R = y - np.dot(X, w)
-            for i in range(n_samples):
-                R[i] = y[i] - ddot(n_features,
-                                <DOUBLE*>(X.data + i * sizeof(DOUBLE)),
-                                n_samples, <DOUBLE*>w.data, 1)
-
-            # tol *= np.dot(y, y)
-            tol *= ddot(n_samples, <DOUBLE*>y.data, n_tasks,
-                        <DOUBLE*>y.data, n_tasks)
-
-            for n_iter in range(max_iter):
-                w_max = 0.0
-                d_w_max = 0.0
-                for f_iter in range(n_features):  # Loop over coordinates
-                    if random:
-                        ii = rand_int(n_features, rand_r_state)
-                    else:
-                        ii = f_iter
-
-                    if norm_cols_X[ii] == 0.0:
-                        continue
-
-                    w_ii = w[ii]  # Store previous value
-
-                    if w_ii != 0.0:
-                        # R += w_ii * X[:,ii]
-                        daxpy(n_samples, w_ii,
-                            <DOUBLE*>(X.data + ii * n_samples * sizeof(DOUBLE)),
-                            1, <DOUBLE*>R.data, 1)
-
-                    # tmp = (X[:,ii]*R).sum()
-                    tmp = ddot(n_samples,
-                            <DOUBLE*>(X.data + ii * n_samples * sizeof(DOUBLE)),
-                            1, <DOUBLE*>R.data, 1)
-
-                    if positive and tmp < 0:
-                        w[ii] = 0.0
-                    else:
-                        w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0)
-                                / (norm_cols_X[ii] + beta))
-
-                    if w[ii] != 0.0:
-                        # R -=  w[ii] * X[:,ii] # Update residual
-                        daxpy(n_samples, -w[ii],
-                            <DOUBLE*>(X.data + ii * n_samples * sizeof(DOUBLE)),
-                            1, <DOUBLE*>R.data, 1)
-
-                    # update the maximum absolute coefficient update
-                    d_w_ii = fabs(w[ii] - w_ii)
-                    if d_w_ii > d_w_max:
-                        d_w_max = d_w_ii
-
-                    if fabs(w[ii]) > w_max:
-                        w_max = fabs(w[ii])
-
-                if (w_max == 0.0
-                        or d_w_max / w_max < d_w_tol
-                        or n_iter == max_iter - 1):
-                    # the biggest coordinate update of this iteration was smaller
-                    # than the tolerance: check the duality gap as ultimate
-                    # stopping criterion
-
-                    # XtA = np.dot(X.T, R) - beta * w
-                    for i in range(n_features):
-                        XtA[i] = ddot(
-                            n_samples,
-                            <DOUBLE*>(X.data + i * n_samples *sizeof(DOUBLE)),
-                            1, <DOUBLE*>R.data, 1) - beta * w[i]
-
-                    if positive:
-                        dual_norm_XtA = max(n_features, <DOUBLE*>XtA.data)
-                    else:
-                        dual_norm_XtA = abs_max(n_features, <DOUBLE*>XtA.data)
-
-                    # R_norm2 = np.dot(R, R)
-                    R_norm2 = ddot(n_samples, <DOUBLE*>R.data, 1,
-                                <DOUBLE*>R.data, 1)
-
-                    # w_norm2 = np.dot(w, w)
-                    w_norm2 = ddot(n_features, <DOUBLE*>w.data, 1,
-                                <DOUBLE*>w.data, 1)
-
-                    if (dual_norm_XtA > alpha):
-                        const = alpha / dual_norm_XtA
-                        A_norm2 = R_norm2 * (const ** 2)
-                        gap = 0.5 * (R_norm2 + A_norm2)
-                    else:
-                        const = 1.0
-                        gap = R_norm2
-
-                    l1_norm = dasum(n_features, <DOUBLE*>w.data, 1)
-
-                    # np.dot(R.T, y)
-                    gap += (alpha * l1_norm - const * ddot(
-                                n_samples,
-                                <DOUBLE*>R.data, 1,
-                                <DOUBLE*>y.data, n_tasks)
-                            + 0.5 * beta * (1 + const ** 2) * (w_norm2))
-
-                    if gap < tol:
-                        # return if we reached desired tolerance
-                        break
-        else:
-            # R = y - np.dot(X, w)
-            for i in range(n_samples):
-                R[i] = y[i] - sdot(n_features,
-                                <float*>(X.data + i * sizeof(float)),
-                                n_samples, <float*>w.data, 1)
-
-            # tol *= np.dot(y, y)
-            tol *= sdot(n_samples, <float*>y.data, n_tasks,
-                        <float*>y.data, n_tasks)
-
-            for n_iter in range(max_iter):
-                w_max = 0.0
-                d_w_max = 0.0
-                for f_iter in range(n_features):  # Loop over coordinates
-                    if random:
-                        ii = rand_int(n_features, rand_r_state)
-                    else:
-                        ii = f_iter
-
-                    if norm_cols_X[ii] == 0.0:
-                        continue
-
-                    w_ii = w[ii]  # Store previous value
-
-                    if w_ii != 0.0:
-                        # R += w_ii * X[:,ii]
-                        saxpy(n_samples, w_ii,
-                            <float*>(X.data + ii * n_samples * sizeof(float)),
-                            1, <float*>R.data, 1)
-
-                    # tmp = (X[:,ii]*R).sum()
-                    tmp = sdot(n_samples,
-                            <float*>(X.data + ii * n_samples * sizeof(float)),
-                            1, <float*>R.data, 1)
-
-                    if positive and tmp < 0:
-                        w[ii] = 0.0
-                    else:
-                        w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0)
-                                / (norm_cols_X[ii] + beta))
-
-                    if w[ii] != 0.0:
-                        # R -=  w[ii] * X[:,ii] # Update residual
-                        saxpy(n_samples, -w[ii],
-                            <float*>(X.data + ii * n_samples * sizeof(float)),
-                            1, <float*>R.data, 1)
-
-                    # update the maximum absolute coefficient update
-                    d_w_ii = fabs(w[ii] - w_ii)
-                    if d_w_ii > d_w_max:
-                        d_w_max = d_w_ii
-
-                    if fabs(w[ii]) > w_max:
-                        w_max = fabs(w[ii])
-
-                if (w_max == 0.0
-                        or d_w_max / w_max < d_w_tol
-                        or n_iter == max_iter - 1):
-                    # the biggest coordinate update of this iteration was smaller
-                    # than the tolerance: check the duality gap as ultimate
-                    # stopping criterion
-
-                    # XtA = np.dot(X.T, R) - beta * w
-                    for i in range(n_features):
-                        XtA[i] = sdot(
+        # R = y - np.dot(X, w)
+        for i in range(n_samples):
+            R[i] = y[i] - dot(n_features,
+                            <floating*>(X.data + i * sizeof(floating)),
+                            n_samples, <floating*>w.data, 1)
+
+        # tol *= np.dot(y, y)
+        tol *= dot(n_samples, <floating*>y.data, n_tasks,
+                    <floating*>y.data, n_tasks)
+
+        for n_iter in range(max_iter):
+            w_max = 0.0
+            d_w_max = 0.0
+            for f_iter in range(n_features):  # Loop over coordinates
+                if random:
+                    ii = rand_int(n_features, rand_r_state)
+                else:
+                    ii = f_iter
+
+                if norm_cols_X[ii] == 0.0:
+                    continue
+
+                w_ii = w[ii]  # Store previous value
+
+                if w_ii != 0.0:
+                    # R += w_ii * X[:,ii]
+                    axpy(n_samples, w_ii,
+                        <floating*>(X.data + ii * n_samples * sizeof(floating)),
+                        1, <floating*>R.data, 1)
+
+                # tmp = (X[:,ii]*R).sum()
+                tmp = dot(n_samples,
+                        <floating*>(X.data + ii * n_samples * sizeof(floating)),
+                        1, <floating*>R.data, 1)
+
+                if positive and tmp < 0:
+                    w[ii] = 0.0
+                else:
+                    w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0)
+                            / (norm_cols_X[ii] + beta))
+
+                if w[ii] != 0.0:
+                    # R -=  w[ii] * X[:,ii] # Update residual
+                    axpy(n_samples, -w[ii],
+                        <floating*>(X.data + ii * n_samples * sizeof(floating)),
+                        1, <floating*>R.data, 1)
+
+                # update the maximum absolute coefficient update
+                d_w_ii = fabs(w[ii] - w_ii)
+                if d_w_ii > d_w_max:
+                    d_w_max = d_w_ii
+
+                if fabs(w[ii]) > w_max:
+                    w_max = fabs(w[ii])
+
+            if (w_max == 0.0
+                    or d_w_max / w_max < d_w_tol
+                    or n_iter == max_iter - 1):
+                # the biggest coordinate update of this iteration was smaller
+                # than the tolerance: check the duality gap as ultimate
+                # stopping criterion
+
+                # XtA = np.dot(X.T, R) - beta * w
+                for i in range(n_features):
+                    XtA[i] = dot(
+                        n_samples,
+                        <floating*>(X.data + i * n_samples *sizeof(floating)),
+                        1, <floating*>R.data, 1) - beta * w[i]
+
+                if positive:
+                    dual_norm_XtA = max(n_features, <floating*>XtA.data)
+                else:
+                    dual_norm_XtA = abs_max(n_features, <floating*>XtA.data)
+
+                # R_norm2 = np.dot(R, R)
+                R_norm2 = dot(n_samples, <floating*>R.data, 1,
+                            <floating*>R.data, 1)
+
+                # w_norm2 = np.dot(w, w)
+                w_norm2 = dot(n_features, <floating*>w.data, 1,
+                            <floating*>w.data, 1)
+
+                if (dual_norm_XtA > alpha):
+                    const = alpha / dual_norm_XtA
+                    A_norm2 = R_norm2 * (const ** 2)
+                    gap = 0.5 * (R_norm2 + A_norm2)
+                else:
+                    const = 1.0
+                    gap = R_norm2
+
+                l1_norm = asum(n_features, <floating*>w.data, 1)
+
+                # np.dot(R.T, y)
+                gap += (alpha * l1_norm - const * dot(
                             n_samples,
-                            <float*>(X.data + i * n_samples *sizeof(float)),
-                            1, <float*>R.data, 1) - beta * w[i]
-
-                    if positive:
-                        dual_norm_XtA = max(n_features, <float*>XtA.data)
-                    else:
-                        dual_norm_XtA = abs_max(n_features, <float*>XtA.data)
-
-                    # R_norm2 = np.dot(R, R)
-                    R_norm2 = sdot(n_samples, <float*>R.data, 1,
-                                <float*>R.data, 1)
-
-                    # w_norm2 = np.dot(w, w)
-                    w_norm2 = sdot(n_features, <float*>w.data, 1,
-                                <float*>w.data, 1)
-
-                    if (dual_norm_XtA > alpha):
-                        const = alpha / dual_norm_XtA
-                        A_norm2 = R_norm2 * (const ** 2)
-                        gap = 0.5 * (R_norm2 + A_norm2)
-                    else:
-                        const = 1.0
-                        gap = R_norm2
-
-                    l1_norm = sasum(n_features, <float*>w.data, 1)
-
-                    # np.dot(R.T, y)
-                    gap += (alpha * l1_norm - const * sdot(
-                                n_samples,
-                                <float*>R.data, 1,
-                                <float*>y.data, n_tasks)
-                            + 0.5 * beta * (1 + const ** 2) * (w_norm2))
-
-                    if gap < tol:
-                        # return if we reached desired tolerance
-                        break
+                            <floating*>R.data, 1,
+                            <floating*>y.data, n_tasks)
+                        + 0.5 * beta * (1 + const ** 2) * (w_norm2))
 
+                if gap < tol:
+                    # return if we reached desired tolerance
+                    break
     return w, gap, tol, n_iter + 1
 
 

From 8339aa13fdf59565f161fd0c2f645165c42a146b Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Thu, 7 Jul 2016 22:45:39 +0800
Subject: [PATCH 09/50] Remove redundant c files and import

---
 sklearn/linear_model/cd_fast.pyx |  10 ---
 sklearn/src/cblas/cblas_sgemv.c  | 102 -------------------------------
 sklearn/src/cblas/cblas_sger.c   |  85 --------------------------
 sklearn/src/cblas/cblas_sscal.c  |  43 -------------
 4 files changed, 240 deletions(-)
 delete mode 100644 sklearn/src/cblas/cblas_sgemv.c
 delete mode 100644 sklearn/src/cblas/cblas_sger.c
 delete mode 100644 sklearn/src/cblas/cblas_sscal.c

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 4143ce4963aa0..c3fd6b72a88c1 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -121,24 +121,14 @@ cdef extern from "cblas.h":
     float sasum "cblas_sasum"(int N, float *X, int incX) nogil
     void dger "cblas_dger"(CBLAS_ORDER Order, int M, int N, double alpha,
                 double *X, int incX, double *Y, int incY, double *A, int lda) nogil
-    void sger "cblas_sger"(CBLAS_ORDER Order, int M, int N, float alpha,
-                float *X, int incX, float *Y, int incY, float *A, int lda) nogil
     void dgemv "cblas_dgemv"(CBLAS_ORDER Order,
                       CBLAS_TRANSPOSE TransA, int M, int N,
                       double alpha, double *A, int lda,
                       double *X, int incX, double beta,
                       double *Y, int incY) nogil
-    void sgemv "cblas_sgemv"(CBLAS_ORDER Order,
-                      CBLAS_TRANSPOSE TransA, int M, int N,
-                      float alpha, float *A, int lda,
-                      float *X, int incX, float beta,
-                      float *Y, int incY) nogil
     double dnrm2 "cblas_dnrm2"(int N, double *X, int incX) nogil
-    float snrm2 "cblas_snrm2"(int N, float *X, int incX) nogil
     void dcopy "cblas_dcopy"(int N, double *X, int incX, double *Y, int incY) nogil
-    void scopy "cblas_scopy"(int N, float *X, int incX, float *Y, int incY) nogil
     void dscal "cblas_dscal"(int N, double alpha, double *X, int incX) nogil
-    void sscal "cblas_sscal"(int N, float alpha, float *X, int incX) nogil
 
 
 @cython.boundscheck(False)
diff --git a/sklearn/src/cblas/cblas_sgemv.c b/sklearn/src/cblas/cblas_sgemv.c
deleted file mode 100644
index 2a2f09730baeb..0000000000000
--- a/sklearn/src/cblas/cblas_sgemv.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- *             Automatically Tuned Linear Algebra Software v3.10.2
- *                    (C) Copyright 1999 R. Clint Whaley
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *   1. Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions, and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *   3. The name of the ATLAS group or the names of its contributers may
- *      not be used to endorse or promote products derived from this
- *      software without specific written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define SREAL
-#include "atlas_misc.h"
-#include "cblas.h"
-#ifdef ATL_USEPTHREADS
-   #include "atlas_ptalias2.h"
-#endif
-#include "atlas_level2.h"
-
-void cblas_sgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TA,
-                 const int M, const int N, const float alpha, const float *A,
-                 const int lda, const float *X, const int incX,
-                 const float beta, float *Y, const int incY)
-{
-   int info = 2000;
-   #define x X
-   #define y Y
-
-#ifndef NoCblasErrorChecks
-   if (TA != CblasNoTrans && TA != CblasTrans && TA != CblasConjTrans)
-      info = cblas_errprn(2, info,
-                          "TransA must be %d, %d or %d, but is set to %d",
-                          CblasNoTrans, CblasTrans, CblasConjTrans, TA);
-
-   if (M < 0) info = cblas_errprn(3, info,
-                        "M cannot be less than zero; is set to %d.", M);
-   if (N < 0) info = cblas_errprn(4, info,
-                        "N cannot be less than zero; is set to %d.", N);
-   if (!incX) info = cblas_errprn(9, info,
-                                  "incX cannot be zero; is set to %d.", incX);
-   if (!incY) info = cblas_errprn(12, info,
-                                  "incY cannot be zero; is set to %d.", incY);
-   if (Order == CblasColMajor)
-   {
-      if (lda < M || lda < 1)
-         info = cblas_errprn(7, info, "lda must be >= MAX(M,1): lda=%d M=%d",
-                             lda, M);
-   }
-   else if (Order == CblasRowMajor)
-   {
-      if (lda < N || lda < 1)
-         info = cblas_errprn(7, info, "lda must be >= MAX(N,1): lda=%d N=%d",
-                             lda, N);
-   }
-   else
-      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
-                          CblasRowMajor, CblasColMajor, Order);
-   if (info != 2000)
-   {
-      cblas_xerbla(info, "cblas_sgemv", "");
-      return;
-   }
-#endif
-   if (TA == AtlasNoTrans)
-   {
-      if (incX < 0) x += (1-N)*incX;
-      if (incY < 0) y += (1-M)*incY;
-   }
-   else
-   {
-      if (incX < 0) x += (1-M)*incX;
-      if (incY < 0) y += (1-N)*incY;
-   }
-   if (Order == CblasColMajor)
-      ATL_sgemv(TA, M, N, alpha, A, lda, x, incX, beta, y, incY);
-   else
-   {
-      if (TA == CblasNoTrans)
-         ATL_sgemv(CblasTrans, N, M, alpha, A, lda, x, incX, beta, y, incY);
-      else
-         ATL_sgemv(CblasNoTrans, N, M, alpha, A, lda, x, incX, beta, y, incY);
-   }
-}
diff --git a/sklearn/src/cblas/cblas_sger.c b/sklearn/src/cblas/cblas_sger.c
deleted file mode 100644
index 7cee9a7432faa..0000000000000
--- a/sklearn/src/cblas/cblas_sger.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- *             Automatically Tuned Linear Algebra Software v3.10.2
- *                    (C) Copyright 1999 R. Clint Whaley
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *   1. Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions, and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *   3. The name of the ATLAS group or the names of its contributers may
- *      not be used to endorse or promote products derived from this
- *      software without specific written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define SREAL
-#include "atlas_misc.h"
-#include "cblas.h"
-#ifdef ATL_USEPTHREADS
-   #include "atlas_ptalias2.h"
-#endif
-#include "atlas_level2.h"
-
-void cblas_sger (const enum CBLAS_ORDER Order, const int M, const int N,
-                 const float alpha, const float *X, const int incX,
-                 const float *Y, const int incY, float *A, const int lda)
-{
-   int info = 2000;
-   #define x X
-   #define y Y
-
-#ifndef NoCblasErrorChecks
-   if (M < 0) info = cblas_errprn(2, info,
-                        "M cannot be less than zero; is set to %d.", M);
-   if (N < 0) info = cblas_errprn(3, info,
-                        "N cannot be less than zero; is set to %d.", N);
-   if (!incX) info = cblas_errprn(6, info,
-                                  "incX cannot be zero; is set to %d.", incX);
-   if (!incY) info = cblas_errprn(8, info,
-                                  "incY cannot be zero; is set to %d.", incY);
-   if (Order == CblasColMajor)
-   {
-      if (lda < M || lda < 1)
-         info = cblas_errprn(10, info, "lda must be >= MAX(M,1): lda=%d M=%d",
-                             lda, M);
-   }
-   else if (Order == CblasRowMajor)
-   {
-      if (lda < N || lda < 1)
-         info = cblas_errprn(10, info, "lda must be >= MAX(N,1): lda=%d M=%d",
-                             lda, N);
-   }
-   else
-      info = cblas_errprn(1, info, "Order must be %d or %d, but is set to %d",
-                          CblasRowMajor, CblasColMajor, Order);
-   if (info != 2000)
-   {
-      cblas_xerbla(info, "cblas_sger", "");
-      return;
-   }
-#endif
-
-   if (incX < 0) x += (1-M)*incX;
-   if (incY < 0) y += (1-N)*incY;
-
-   if (Order == CblasColMajor)
-      ATL_sger(M, N, alpha, x, incX, y, incY, A, lda);
-   else
-      ATL_sger(N, M, alpha, y, incY, x, incX, A, lda);
-}
diff --git a/sklearn/src/cblas/cblas_sscal.c b/sklearn/src/cblas/cblas_sscal.c
deleted file mode 100644
index ea06c2be66439..0000000000000
--- a/sklearn/src/cblas/cblas_sscal.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- *             Automatically Tuned Linear Algebra Software v3.10.2
- *                    (C) Copyright 1999 R. Clint Whaley
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *   1. Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions, and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *   3. The name of the ATLAS group or the names of its contributers may
- *      not be used to endorse or promote products derived from this
- *      software without specific written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define SREAL
-#include "atlas_misc.h"
-#ifdef ATL_USEPTHREADS
-   #include "atlas_ptalias1.h"
-#endif
-#include "atlas_level1.h"
-#include "cblas.h"
-
-void cblas_sscal(const int N, const float alpha, float *X, const int incX)
-{
-   if (N > 0 && incX > 0)
-      ATL_sscal(N, alpha, X, incX);
-}

From 9b8f470d4548cff46083a0324945de490f1155cc Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Thu, 7 Jul 2016 22:48:16 +0800
Subject: [PATCH 10/50] Revert unnecessary formatting change

---
 sklearn/linear_model/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index 309b722a4eff9..f713593741726 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -557,8 +557,8 @@ def _pre_fit(X, y, Xy, precompute, normalize, fit_intercept, copy):
         X, y, X_offset, y_offset, X_scale = _preprocess_data(
             X, y, fit_intercept=fit_intercept, normalize=normalize, copy=copy)
     if hasattr(precompute, '__array__') and (
-            fit_intercept and not np.allclose(X_offset, np.zeros(n_features))
-            or normalize and not np.allclose(X_scale, np.ones(n_features))):
+            fit_intercept and not np.allclose(X_offset, np.zeros(n_features)) or
+            normalize and not np.allclose(X_scale, np.ones(n_features))):
         warnings.warn("Gram matrix was provided but X was centered"
                       " to fit intercept, "
                       "or X was normalized : recomputing Gram matrix.",

From 9ac624d2291e03c08e4c2e9bd09b876dcc48ae98 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Thu, 7 Jul 2016 23:02:22 +0800
Subject: [PATCH 11/50] Update comment

---
 sklearn/linear_model/coordinate_descent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index dde9e18124598..90e6699ce5aff 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -372,7 +372,7 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     ElasticNet
     ElasticNetCV
     """
-    # We expect X and y to be already float64 Fortran ordered when bypassing
+    # We expect X and y to be already Fortran ordered when bypassing
     # checks
     if check_input:
         if X.dtype is np.float32:

From f9922090a042c84da62bce5d7a79b2b12045c3e4 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Fri, 8 Jul 2016 17:32:49 +0800
Subject: [PATCH 12/50] Make coef_ type consistent

---
 sklearn/linear_model/base.py               | 4 ++++
 sklearn/linear_model/coordinate_descent.py | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index f713593741726..58f7c5a67a10f 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -273,6 +273,10 @@ def _set_intercept(self, X_offset, y_offset, X_scale):
         """Set the intercept_
         """
         if self.fit_intercept:
+            dtype = self.coef_.dtype
+            X_offset = np.asarray(X_offset, dtype)
+            y_offset = np.asarray(y_offset, dtype)
+            X_scale = np.asarray(X_scale, dtype)
             self.coef_ = self.coef_ / X_scale
             self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)
         else:
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 90e6699ce5aff..d1030b9eb0d0c 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -442,9 +442,9 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
                          dtype=X.dtype)
 
     if coef_init is None:
-        coef_ = np.asfortranarray(np.zeros(coefs.shape[:-1]))
+        coef_ = np.asfortranarray(np.zeros(coefs.shape[:-1], dtype=X.dtype))
     else:
-        coef_ = np.asfortranarray(coef_init)
+        coef_ = np.asfortranarray(coef_init, dtype=X.dtype)
 
     for i, alpha in enumerate(alphas):
         l1_reg = alpha * l1_ratio * n_samples

From 5bfeb9386ad815fddd9a66a90992aee83b8cd71b Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Fri, 8 Jul 2016 17:42:24 +0800
Subject: [PATCH 13/50] Test float32 input

---
 .../tests/test_coordinate_descent.py          | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index 918180ce18915..2b965a6c49d16 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -670,3 +670,24 @@ def test_lasso_non_float_y():
         clf_float = model(fit_intercept=False)
         clf_float.fit(X, y_float)
         assert_array_equal(clf.coef_, clf_float.coef_)
+
+
+def test_enet_float_precision():
+	# Generate dataset
+    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
+                                         n_informative_features=100)
+    # Here we have a small number of iterations, and thus the
+    # ElasticNet might not converge. This is to speed up tests
+    clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=100, precompute=False)
+
+    coef = {}
+    for dtype in [np.float64, np.float32]:
+        X = dtype(X)
+        y = dtype(y)
+        ignore_warnings(clf.fit)(X, y)
+
+        assert_equal(clf.coef_.dtype, dtype)
+        coef[dtype] = clf.coef_
+
+    assert_array_almost_equal(coef[np.float32], coef[np.float64],
+                              decimal=4)

From 2310766be199d0ea5de6d115596f066521abb7e6 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Fri, 8 Jul 2016 17:49:19 +0800
Subject: [PATCH 14/50] Add user warning when fitting float32 data with small
 alpha

---
 sklearn/linear_model/coordinate_descent.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index d1030b9eb0d0c..bac1a281eac68 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -482,6 +482,10 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
                           ' You might want' +
                           ' to increase the number of iterations',
                           ConvergenceWarning)
+            if X.dtype == np.float32:
+                warnings.warn('It may cause by precision issues' +
+                              ' when fitting float32 data with small alpha.'
+                              ' Try to increase alpha of your model.')
 
         if verbose:
             if verbose > 2:

From 2ff201e980b19a36dd714ea2492a2729c36284d4 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Fri, 8 Jul 2016 22:53:57 +0800
Subject: [PATCH 15/50] Fix _set_intercept when coef_ is not an ndarray

---
 sklearn/linear_model/base.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index 58f7c5a67a10f..ae5c447d95d3f 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -273,10 +273,11 @@ def _set_intercept(self, X_offset, y_offset, X_scale):
         """Set the intercept_
         """
         if self.fit_intercept:
-            dtype = self.coef_.dtype
-            X_offset = np.asarray(X_offset, dtype)
-            y_offset = np.asarray(y_offset, dtype)
-            X_scale = np.asarray(X_scale, dtype)
+            if isinstance(self.coef_, np.ndarray):
+                dtype = self.coef_.dtype
+                X_offset = np.asarray(X_offset, dtype)
+                y_offset = np.asarray(y_offset, dtype)
+                X_scale = np.asarray(X_scale, dtype)
             self.coef_ = self.coef_ / X_scale
             self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)
         else:

From 38c4d0664e5818d7529f12f69a3ed94c58d44e04 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Fri, 8 Jul 2016 23:13:51 +0800
Subject: [PATCH 16/50] Change variable to floating type

---
 sklearn/linear_model/cd_fast.pyx | 55 +++++++++++++++++++-------------
 1 file changed, 33 insertions(+), 22 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index c3fd6b72a88c1..66da19a7cc5f3 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -315,14 +315,14 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
 @cython.boundscheck(False)
 @cython.wraparound(False)
 @cython.cdivision(True)
-def sparse_enet_coordinate_descent(double[:] w,
-                            double alpha, double beta,
-                            np.ndarray[double, ndim=1, mode='c'] X_data,
+def sparse_enet_coordinate_descent(floating [:] w,
+                            floating alpha, floating beta,
+                            np.ndarray[floating, ndim=1, mode='c'] X_data,
                             np.ndarray[int, ndim=1, mode='c'] X_indices,
                             np.ndarray[int, ndim=1, mode='c'] X_indptr,
-                            np.ndarray[double, ndim=1] y,
-                            double[:] X_mean, int max_iter,
-                            double tol, object rng, bint random=0,
+                            np.ndarray[floating, ndim=1] y,
+                            floating[:] X_mean, int max_iter,
+                            floating tol, object rng, bint random=0,
                             bint positive=0):
     """Cython version of the coordinate descent algorithm for Elastic-Net
 
@@ -338,31 +338,42 @@ def sparse_enet_coordinate_descent(double[:] w,
 
     # compute norms of the columns of X
     cdef unsigned int ii
-    cdef double[:] norm_cols_X = np.zeros(n_features, np.float64)
+    cdef floating[:] norm_cols_X
 
     cdef unsigned int startptr = X_indptr[0]
     cdef unsigned int endptr
 
     # get the number of tasks indirectly, using strides
-    cdef unsigned int n_tasks = y.strides[0] / sizeof(DOUBLE)
+    cdef unsigned int n_tasks
 
     # initial value of the residuals
-    cdef double[:] R = y.copy()
+    cdef floating[:] R = y.copy()
 
-    cdef double[:] X_T_R = np.zeros(n_features)
-    cdef double[:] XtA = np.zeros(n_features)
+    cdef floating[:] X_T_R
+    cdef floating[:] XtA
 
-    cdef double tmp
-    cdef double w_ii
-    cdef double d_w_max
-    cdef double w_max
-    cdef double d_w_ii
-    cdef double X_mean_ii
-    cdef double R_sum = 0.0
-    cdef double normalize_sum
-    cdef double gap = tol + 1.0
-    cdef double d_w_tol = tol
-    cdef double dual_norm_XtA
+    if floating is float:
+        norm_cols_X = np.zeros(n_features, dtype=np.float32)
+        n_tasks = y.strides[0] / sizeof(float)
+        X_T_R = np.zeros(n_features, dtype=np.float32)
+        XtA = np.zeros(n_features, dtype=np.float32)
+    else:
+        norm_cols_X = np.zeros(n_features, np.float64)
+        n_tasks = y.strides[0] / sizeof(DOUBLE)
+        X_T_R = np.zeros(n_features)
+        XtA = np.zeros(n_features)
+
+    cdef floating tmp
+    cdef floating w_ii
+    cdef floating d_w_max
+    cdef floating w_max
+    cdef floating d_w_ii
+    cdef floating X_mean_ii
+    cdef floating R_sum = 0.0
+    cdef floating normalize_sum
+    cdef floating gap = tol + 1.0
+    cdef floating d_w_tol = tol
+    cdef floating dual_norm_XtA
     cdef unsigned int jj
     cdef unsigned int n_iter = 0
     cdef unsigned int f_iter

From 75da365d0a5f217fd53c0bae1e26ce4db18966f1 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Fri, 8 Jul 2016 23:27:03 +0800
Subject: [PATCH 17/50] Make cd sparse support fused types

---
 sklearn/linear_model/cd_fast.pyx | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 66da19a7cc5f3..fbdb29c32545c 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -352,16 +352,24 @@ def sparse_enet_coordinate_descent(floating [:] w,
     cdef floating[:] X_T_R
     cdef floating[:] XtA
 
+    # fused types version of BLAS functions
+    cdef DOT dot
+    cdef ASUM asum
+
     if floating is float:
         norm_cols_X = np.zeros(n_features, dtype=np.float32)
         n_tasks = y.strides[0] / sizeof(float)
         X_T_R = np.zeros(n_features, dtype=np.float32)
         XtA = np.zeros(n_features, dtype=np.float32)
+        dot = sdot
+        asum = sasum
     else:
         norm_cols_X = np.zeros(n_features, np.float64)
         n_tasks = y.strides[0] / sizeof(DOUBLE)
         X_T_R = np.zeros(n_features)
         XtA = np.zeros(n_features)
+        dot = ddot
+        asum = dasum
 
     cdef floating tmp
     cdef floating w_ii
@@ -370,6 +378,10 @@ def sparse_enet_coordinate_descent(floating [:] w,
     cdef floating d_w_ii
     cdef floating X_mean_ii
     cdef floating R_sum = 0.0
+    cdef floating R_norm2
+    cdef floating w_norm2
+    cdef floating A_norm2
+    cdef floating l1_norm
     cdef floating normalize_sum
     cdef floating gap = tol + 1.0
     cdef floating d_w_tol = tol
@@ -406,7 +418,7 @@ def sparse_enet_coordinate_descent(floating [:] w,
             startptr = endptr
 
         # tol *= np.dot(y, y)
-        tol *= ddot(n_samples, <DOUBLE*>&y[0], 1, <DOUBLE*>&y[0], 1)
+        tol *= dot(n_samples, <floating*>&y[0], 1, <floating*>&y[0], 1)
 
         for n_iter in range(max_iter):
 
@@ -494,10 +506,10 @@ def sparse_enet_coordinate_descent(floating [:] w,
                     dual_norm_XtA = abs_max(n_features, &XtA[0])
 
                 # R_norm2 = np.dot(R, R)
-                R_norm2 = ddot(n_samples, <DOUBLE*>&R[0], 1, <DOUBLE*>&R[0], 1)
+                R_norm2 = dot(n_samples, <floating*>&R[0], 1, <floating*>&R[0], 1)
 
                 # w_norm2 = np.dot(w, w)
-                w_norm2 = ddot(n_features, <DOUBLE*>&w[0], 1, <DOUBLE*>&w[0], 1)
+                w_norm2 = dot(n_features, <floating*>&w[0], 1, <floating*>&w[0], 1)
                 if (dual_norm_XtA > alpha):
                     const = alpha / dual_norm_XtA
                     A_norm2 = R_norm2 * const**2
@@ -506,13 +518,13 @@ def sparse_enet_coordinate_descent(floating [:] w,
                     const = 1.0
                     gap = R_norm2
 
-                l1_norm = dasum(n_features, <DOUBLE*>&w[0], 1)
+                l1_norm = asum(n_features, <floating*>&w[0], 1)
 
                 # The expression inside ddot is equivalent to np.dot(R.T, y)
-                gap += (alpha * l1_norm - const * ddot(
+                gap += (alpha * l1_norm - const * dot(
                             n_samples,
-                            <DOUBLE*>&R[0], 1,
-                            <DOUBLE*>&y[0], n_tasks
+                            <floating*>&R[0], 1,
+                            <floating*>&y[0], n_tasks
                             )
                         + 0.5 * beta * (1 + const ** 2) * w_norm2)
 

From e9bee9dcff378eefe5fc4f62dcfe68a843efaf01 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Sat, 9 Jul 2016 02:46:43 +0800
Subject: [PATCH 18/50] Make CD support fused types when data is sparse

---
 sklearn/linear_model/coordinate_descent.py | 26 ++++++++--------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index bac1a281eac68..4179e42b22ff6 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -405,8 +405,9 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
             # As sparse matrices are not actually centered we need this
             # to be passed to the CD solver.
             X_sparse_scaling = params['X_offset'] / params['X_scale']
+            X_sparse_scaling = np.asarray(X_sparse_scaling, dtype=X.dtype)
         else:
-            X_sparse_scaling = np.zeros(n_features)
+            X_sparse_scaling = np.zeros(n_features, dtype=X.dtype)
 
     # X should be normalized and fit already if function is called
     # from ElasticNet.fit
@@ -680,22 +681,13 @@ def fit(self, X, y, check_input=True):
         # We expect X and y to be already float64 Fortran ordered arrays
         # when bypassing checks
         if check_input:
-            if sparse.issparse(X):
-                y = np.asarray(y, dtype=np.float64)
-                X, y = check_X_y(X, y, accept_sparse='csc',
-                                 order='F', dtype=np.float64,
-                                 copy=self.copy_X and self.fit_intercept,
-                                 multi_output=True, y_numeric=True)
-                y = check_array(y, order='F', copy=False, dtype=np.float64,
-                                ensure_2d=False)
-            else:
-                y = np.asarray(y)
-                X, y = check_X_y(X, y, accept_sparse='csc',
-                                 order='F', dtype=[np.float64, np.float32],
-                                 copy=self.copy_X and self.fit_intercept,
-                                 multi_output=True, y_numeric=True)
-                y = check_array(y, order='F', copy=False, dtype=X.dtype.type,
-                                ensure_2d=False)
+            y = np.asarray(y)
+            X, y = check_X_y(X, y, accept_sparse='csc',
+                                order='F', dtype=[np.float64, np.float32],
+                                copy=self.copy_X and self.fit_intercept,
+                                multi_output=True, y_numeric=True)
+            y = check_array(y, order='F', copy=False, dtype=X.dtype.type,
+                            ensure_2d=False)
 
         X, y, X_offset, y_offset, X_scale, precompute, Xy = \
             _pre_fit(X, y, None, self.precompute, self.normalize,

From cc9df4a2e2a58c39609392b888405725d6f25abb Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 19 Jul 2016 00:03:53 +0800
Subject: [PATCH 19/50] Add referenced src files

---
 sklearn/src/cblas/ATL_srefasum.c | 133 ++++++++++++++++++++++++++
 sklearn/src/cblas/ATL_srefaxpy.c | 157 +++++++++++++++++++++++++++++++
 2 files changed, 290 insertions(+)
 create mode 100644 sklearn/src/cblas/ATL_srefasum.c
 create mode 100644 sklearn/src/cblas/ATL_srefaxpy.c

diff --git a/sklearn/src/cblas/ATL_srefasum.c b/sklearn/src/cblas/ATL_srefasum.c
new file mode 100644
index 0000000000000..aec26caf011ac
--- /dev/null
+++ b/sklearn/src/cblas/ATL_srefasum.c
@@ -0,0 +1,133 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ *    (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.9.24 -- December 25, 2000
+ *
+ * Author         : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ *  Redistribution  and  use in  source and binary forms, with or without
+ *  modification, are  permitted provided  that the following  conditions
+ *  are met:
+ *
+ * 1. Redistributions  of  source  code  must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce  the above copyright
+ *    notice,  this list of conditions, and the  following disclaimer in
+ *    the documentation and/or other materials provided with the distri-
+ *    bution.
+ * 3. The name of the University,  the ATLAS group,  or the names of its
+ *    contributors  may not be used to endorse or promote products deri-
+ *    ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,  INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO,  PROCUREMENT  OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+/*
+ * Include files
+ */
+#include "atlas_refmisc.h"
+#include "atlas_reflevel1.h"
+
+float ATL_srefasum
+(
+   const int                  N,
+   const float                * X,
+   const int                  INCX
+)
+{
+/*
+ * Purpose
+ * =======
+ *
+ * ATL_srefasum   returns the sum of absolute values of the entries of a
+ * vector x.
+ *
+ * Arguments
+ * =========
+ *
+ * N       (input)                       const int
+ *         On entry, N specifies the length of the vector x. N  must  be
+ *         at least zero. Unchanged on exit.
+ *
+ * X       (input)                       const float *
+ *         On entry,  X  points to the  first entry to be accessed of an
+ *         incremented array of size equal to or greater than
+ *            ( 1 + ( n - 1 ) * abs( INCX ) ) * sizeof(   float   ),
+ *         that contains the vector x. Unchanged on exit.
+ *
+ * INCX    (input)                       const int
+ *         On entry, INCX specifies the increment for the elements of X.
+ *         INCX must not be zero. Unchanged on exit.
+ *
+ * ---------------------------------------------------------------------
+ */
+/*
+ * .. Local Variables ..
+ */
+   register float             sum = ATL_sZERO, x0, x1, x2, x3,
+                              x4, x5, x6, x7;
+   float                      * StX;
+   register int               i;
+   int                        nu;
+   const int                  incX2 = 2 * INCX, incX3 = 3 * INCX,
+                              incX4 = 4 * INCX, incX5 = 5 * INCX,
+                              incX6 = 6 * INCX, incX7 = 7 * INCX,
+                              incX8 = 8 * INCX;
+/* ..
+ * .. Executable Statements ..
+ *
+ */
+   if( ( N > 0 ) && ( INCX >= 1 ) )
+   {
+      if( ( nu = ( N >> 3 ) << 3 ) != 0 )
+      {
+         StX = (float *)X + nu * INCX;
+
+         do
+         {
+            x0 = (*X);     x4 = X[incX4]; x1 = X[INCX ]; x5 = X[incX5];
+            x2 = X[incX2]; x6 = X[incX6]; x3 = X[incX3]; x7 = X[incX7];
+
+            sum += Msabs( x0 ); sum += Msabs( x4 );
+            sum += Msabs( x1 ); sum += Msabs( x3 );
+            sum += Msabs( x2 ); sum += Msabs( x6 );
+            sum += Msabs( x5 ); sum += Msabs( x7 );
+
+            X  += incX8;
+
+         } while( X != StX );
+      }
+
+      for( i = N - nu; i != 0; i-- )
+      {
+         x0   = (*X);
+         sum += Msabs( x0 );
+         X   += INCX;
+      }
+   }
+   return( sum );
+/*
+ * End of ATL_srefasum
+ */
+}
diff --git a/sklearn/src/cblas/ATL_srefaxpy.c b/sklearn/src/cblas/ATL_srefaxpy.c
new file mode 100644
index 0000000000000..306e161774ffb
--- /dev/null
+++ b/sklearn/src/cblas/ATL_srefaxpy.c
@@ -0,0 +1,157 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ *    (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.9.24 -- December 25, 2000
+ *
+ * Author         : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ *  Redistribution  and  use in  source and binary forms, with or without
+ *  modification, are  permitted provided  that the following  conditions
+ *  are met:
+ *
+ * 1. Redistributions  of  source  code  must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce  the above copyright
+ *    notice,  this list of conditions, and the  following disclaimer in
+ *    the documentation and/or other materials provided with the distri-
+ *    bution.
+ * 3. The name of the University,  the ATLAS group,  or the names of its
+ *    contributors  may not be used to endorse or promote products deri-
+ *    ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,  INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO,  PROCUREMENT  OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+/*
+ * Include files
+ */
+#include "atlas_refmisc.h"
+#include "atlas_reflevel1.h"
+
+void ATL_srefaxpy
+(
+   const int                  N,
+   const float                ALPHA,
+   const float                * X,
+   const int                  INCX,
+   float                      * Y,
+   const int                  INCY
+)
+{
+/*
+ * Purpose
+ * =======
+ *
+ * ATL_srefaxpy performs the following operation:
+ *
+ *    y := y + alpha * x,
+ *
+ * where alpha is a scalar and x and y are two n-vectors.
+ *
+ * Arguments
+ * =========
+ *
+ * N       (input)                       const int
+ *         On entry, N specifies the length of the vector x. N  must  be
+ *         at least zero. Unchanged on exit.
+ *
+ * ALPHA   (input)                       const float
+ *         On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
+ *         supplied as zero, then the entries of the incremented array X
+ *         need not be set on input. Unchanged on exit.
+ *
+ * X       (input)                       const float *
+ *         On entry,  X  points to the  first entry to be accessed of an
+ *         incremented array of size equal to or greater than
+ *            ( 1 + ( n - 1 ) * abs( INCX ) ) * sizeof(   float   ),
+ *         that contains the vector x. Unchanged on exit.
+ *
+ * INCX    (input)                       const int
+ *         On entry, INCX specifies the increment for the elements of X.
+ *         INCX must not be zero. Unchanged on exit.
+ *
+ * Y       (input/output)                float *
+ *         On entry,  Y  points to the  first entry to be accessed of an
+ *         incremented array of size equal to or greater than
+ *            ( 1 + ( n - 1 ) * abs( INCY ) ) * sizeof(   float   ),
+ *         that contains the vector y.  On exit,  the entries of the in-
+ *         cremented array  Y are updated with the scaled entries of the
+ *         incremented array  X.
+ *
+ * INCY    (input)                       const int
+ *         On entry, INCY specifies the increment for the elements of Y.
+ *         INCY must not be zero. Unchanged on exit.
+ *
+ * ---------------------------------------------------------------------
+ */
+/*
+ * .. Local Variables ..
+ */
+   register const float       alpha = ALPHA;
+   register float             x0, x1, x2, x3, y0, y1, y2, y3;
+   float                      * StX;
+   register int               i;
+   int                        nu;
+   const int                  incX2 = 2 * INCX, incY2 = 2 * INCY,
+                              incX3 = 3 * INCX, incY3 = 3 * INCY,
+                              incX4 = 4 * INCX, incY4 = 4 * INCY;
+/* ..
+ * .. Executable Statements ..
+ *
+ */
+   if( ( N > 0 ) && ( alpha != ATL_sZERO ) )
+   {
+      if( ( nu = ( N >> 2 ) << 2 ) != 0 )
+      {
+         StX = (float *)X + nu * INCX;
+
+         do
+         {
+            x0 = (*X);     y0 = (*Y);     x1 = X[INCX ]; y1 = Y[INCY ];
+            x2 = X[incX2]; y2 = Y[incY2]; x3 = X[incX3]; y3 = Y[incY3];
+
+            *Y       = y0 + alpha * x0; Y[INCY ] = y1 + alpha * x1;
+            Y[incY2] = y2 + alpha * x2; Y[incY3] = y3 + alpha * x3;
+
+            X  += incX4;
+            Y  += incY4;
+
+         } while( X != StX );
+      }
+
+      for( i = N - nu; i != 0; i-- )
+      {
+         x0  = (*X);
+         y0  = (*Y);
+
+         *Y  = y0 + alpha * x0;
+
+         X  += INCX;
+         Y  += INCY;
+      }
+   }
+/*
+ * End of ATL_srefaxpy
+ */
+}

From b22d6765e4c7b98d8904f437be55cb7687c3908c Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 19 Jul 2016 01:35:24 +0800
Subject: [PATCH 20/50] Avoid type casting

---
 sklearn/linear_model/cd_fast.pyx | 55 ++++++++++++++------------------
 1 file changed, 24 insertions(+), 31 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index fbdb29c32545c..123ab08caa478 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -202,6 +202,12 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
     cdef UINT32_t rand_r_state_seed = rng.randint(0, RAND_R_MAX)
     cdef UINT32_t* rand_r_state = &rand_r_state_seed
 
+    cdef floating *X_data = <floating*> X.data
+    cdef floating *y_data = <floating*> y.data
+    cdef floating *w_data = <floating*> w.data
+    cdef floating *R_data = <floating*> R.data
+    cdef floating *XtA_data = <floating*> XtA.data
+
     if alpha == 0:
         warnings.warn("Coordinate descent with alpha=0 may lead to unexpected"
             " results and is discouraged.")
@@ -209,13 +215,10 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
     with nogil:
         # R = y - np.dot(X, w)
         for i in range(n_samples):
-            R[i] = y[i] - dot(n_features,
-                            <floating*>(X.data + i * sizeof(floating)),
-                            n_samples, <floating*>w.data, 1)
+            R[i] = y[i] - dot(n_features, &X_data[i], n_samples, w_data, 1)
 
         # tol *= np.dot(y, y)
-        tol *= dot(n_samples, <floating*>y.data, n_tasks,
-                    <floating*>y.data, n_tasks)
+        tol *= dot(n_samples, y_data, n_tasks, y_data, n_tasks)
 
         for n_iter in range(max_iter):
             w_max = 0.0
@@ -233,14 +236,11 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
 
                 if w_ii != 0.0:
                     # R += w_ii * X[:,ii]
-                    axpy(n_samples, w_ii,
-                        <floating*>(X.data + ii * n_samples * sizeof(floating)),
-                        1, <floating*>R.data, 1)
+                    axpy(n_samples, w_ii, &X_data[ii * n_samples], 1,
+                         R_data, 1)
 
                 # tmp = (X[:,ii]*R).sum()
-                tmp = dot(n_samples,
-                        <floating*>(X.data + ii * n_samples * sizeof(floating)),
-                        1, <floating*>R.data, 1)
+                tmp = dot(n_samples, &X_data[ii * n_samples], 1, R_data, 1)
 
                 if positive and tmp < 0:
                     w[ii] = 0.0
@@ -250,9 +250,8 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
 
                 if w[ii] != 0.0:
                     # R -=  w[ii] * X[:,ii] # Update residual
-                    axpy(n_samples, -w[ii],
-                        <floating*>(X.data + ii * n_samples * sizeof(floating)),
-                        1, <floating*>R.data, 1)
+                    axpy(n_samples, -w[ii], &X_data[ii * n_samples], 1,
+                         R_data, 1)
 
                 # update the maximum absolute coefficient update
                 d_w_ii = fabs(w[ii] - w_ii)
@@ -263,31 +262,27 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
                     w_max = fabs(w[ii])
 
             if (w_max == 0.0
-                    or d_w_max / w_max < d_w_tol
-                    or n_iter == max_iter - 1):
+                or d_w_max / w_max < d_w_tol
+                or n_iter == max_iter - 1):
                 # the biggest coordinate update of this iteration was smaller
                 # than the tolerance: check the duality gap as ultimate
                 # stopping criterion
 
                 # XtA = np.dot(X.T, R) - beta * w
                 for i in range(n_features):
-                    XtA[i] = dot(
-                        n_samples,
-                        <floating*>(X.data + i * n_samples *sizeof(floating)),
-                        1, <floating*>R.data, 1) - beta * w[i]
+                    XtA[i] = dot(n_samples, &X_data[i * n_samples],
+                                 1, R_data, 1) - beta * w[i]
 
                 if positive:
-                    dual_norm_XtA = max(n_features, <floating*>XtA.data)
+                    dual_norm_XtA = max(n_features, XtA_data)
                 else:
-                    dual_norm_XtA = abs_max(n_features, <floating*>XtA.data)
+                    dual_norm_XtA = abs_max(n_features, XtA_data)
 
                 # R_norm2 = np.dot(R, R)
-                R_norm2 = dot(n_samples, <floating*>R.data, 1,
-                            <floating*>R.data, 1)
+                R_norm2 = dot(n_samples, R_data, 1, R_data, 1)
 
                 # w_norm2 = np.dot(w, w)
-                w_norm2 = dot(n_features, <floating*>w.data, 1,
-                            <floating*>w.data, 1)
+                w_norm2 = dot(n_features, w_data, 1, w_data, 1)
 
                 if (dual_norm_XtA > alpha):
                     const = alpha / dual_norm_XtA
@@ -297,13 +292,11 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
                     const = 1.0
                     gap = R_norm2
 
-                l1_norm = asum(n_features, <floating*>w.data, 1)
+                l1_norm = asum(n_features, w_data, 1)
 
                 # np.dot(R.T, y)
-                gap += (alpha * l1_norm - const * dot(
-                            n_samples,
-                            <floating*>R.data, 1,
-                            <floating*>y.data, n_tasks)
+                gap += (alpha * l1_norm
+                        - const * dot(n_samples, R_data, 1, y_data, n_tasks)
                         + 0.5 * beta * (1 + const ** 2) * (w_norm2))
 
                 if gap < tol:

From d5c8d37b9a39f3e304c2516bad20db8f151accc1 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 19 Jul 2016 01:35:43 +0800
Subject: [PATCH 21/50] Fix indentation in test

---
 sklearn/linear_model/tests/test_coordinate_descent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index 2b965a6c49d16..fc72364025a37 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -673,7 +673,7 @@ def test_lasso_non_float_y():
 
 
 def test_enet_float_precision():
-	# Generate dataset
+    # Generate dataset
     X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                          n_informative_features=100)
     # Here we have a small number of iterations, and thus the

From 0dcf4da0f17fc6f0538be28c61027d688834a806 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 19 Jul 2016 00:44:48 +0800
Subject: [PATCH 22/50] Avoid duplicated code

---
 sklearn/linear_model/coordinate_descent.py | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 4179e42b22ff6..4da452f15d865 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -375,22 +375,13 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     # We expect X and y to be already Fortran ordered when bypassing
     # checks
     if check_input:
-        if X.dtype is np.float32:
-            X = check_array(X, 'csc', dtype=np.float32, order='F', copy=copy_X)
-            y = check_array(y, 'csc', dtype=np.float32, order='F', copy=False,
+        X = check_array(X, 'csc', dtype=[np.float64, np.float32], order='F', copy=copy_X)
+        y = check_array(y, 'csc', dtype=X.dtype.type, order='F', copy=False,
                         ensure_2d=False)
-            if Xy is not None:
-                # Xy should be a 1d contiguous array or a 2D C ordered array
-                Xy = check_array(Xy, dtype=np.float32, order='C', copy=False,
-                                 ensure_2d=False)
-        else:
-            X = check_array(X, 'csc', dtype=np.float64, order='F', copy=copy_X)
-            y = check_array(y, 'csc', dtype=np.float64, order='F', copy=False,
-                            ensure_2d=False)
-            if Xy is not None:
-                # Xy should be a 1d contiguous array or a 2D C ordered array
-                Xy = check_array(Xy, dtype=np.float64, order='C', copy=False,
-                                 ensure_2d=False)
+        if Xy is not None:
+            # Xy should be a 1d contiguous array or a 2D C ordered array
+            Xy = check_array(Xy, dtype=X.dtype.type, order='C', copy=False,
+                             ensure_2d=False)
 
     n_samples, n_features = X.shape
 

From 0c0eef88926c64fc430be2363bbb7133752b507e Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 19 Jul 2016 03:39:24 +0800
Subject: [PATCH 23/50] Avoid type casting in sparse implementation

---
 sklearn/linear_model/cd_fast.pyx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 123ab08caa478..eb0f26540acb6 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -411,7 +411,7 @@ def sparse_enet_coordinate_descent(floating [:] w,
             startptr = endptr
 
         # tol *= np.dot(y, y)
-        tol *= dot(n_samples, <floating*>&y[0], 1, <floating*>&y[0], 1)
+        tol *= dot(n_samples, &y[0], 1, &y[0], 1)
 
         for n_iter in range(max_iter):
 
@@ -499,10 +499,10 @@ def sparse_enet_coordinate_descent(floating [:] w,
                     dual_norm_XtA = abs_max(n_features, &XtA[0])
 
                 # R_norm2 = np.dot(R, R)
-                R_norm2 = dot(n_samples, <floating*>&R[0], 1, <floating*>&R[0], 1)
+                R_norm2 = dot(n_samples, &R[0], 1, &R[0], 1)
 
                 # w_norm2 = np.dot(w, w)
-                w_norm2 = dot(n_features, <floating*>&w[0], 1, <floating*>&w[0], 1)
+                w_norm2 = dot(n_features, &w[0], 1, &w[0], 1)
                 if (dual_norm_XtA > alpha):
                     const = alpha / dual_norm_XtA
                     A_norm2 = R_norm2 * const**2
@@ -511,13 +511,13 @@ def sparse_enet_coordinate_descent(floating [:] w,
                     const = 1.0
                     gap = R_norm2
 
-                l1_norm = asum(n_features, <floating*>&w[0], 1)
+                l1_norm = asum(n_features, &w[0], 1)
 
                 # The expression inside ddot is equivalent to np.dot(R.T, y)
                 gap += (alpha * l1_norm - const * dot(
                             n_samples,
-                            <floating*>&R[0], 1,
-                            <floating*>&y[0], n_tasks
+                            &R[0], 1,
+                            &y[0], n_tasks
                             )
                         + 0.5 * beta * (1 + const ** 2) * w_norm2)
 

From cde1d2b1d9d3b1e46c69b8d7b3ee9440d2956def Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 19 Jul 2016 04:40:50 +0800
Subject: [PATCH 24/50] Fix indentation

---
 sklearn/linear_model/cd_fast.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index eb0f26540acb6..ed73b94efa676 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -21,7 +21,7 @@ ctypedef np.uint32_t UINT32_t
 ctypedef floating (*DOT)(int N, floating *X, int incX, floating *Y,
                          int incY) nogil
 ctypedef void (*AXPY)(int N, floating alpha, floating *X, int incX,
-                          floating *Y, int incY) nogil
+                      floating *Y, int incY) nogil
 ctypedef floating (*ASUM)(int N, floating *X, int incX) nogil
 
 np.import_array()
@@ -115,8 +115,8 @@ cdef extern from "cblas.h":
                              float *Y, int incY) nogil
     double ddot "cblas_ddot"(int N, double *X, int incX, double *Y, int incY
                              ) nogil
-    float sdot "cblas_sdot"(int N, float *X, int incX, float *Y, int incY
-                             ) nogil
+    float sdot "cblas_sdot"(int N, float *X, int incX, float *Y,
+                            int incY) nogil
     double dasum "cblas_dasum"(int N, double *X, int incX) nogil
     float sasum "cblas_sasum"(int N, float *X, int incX) nogil
     void dger "cblas_dger"(CBLAS_ORDER Order, int M, int N, double alpha,

From 7bfe714a3bff9e632180d612a2faeea71bc0179a Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 19 Jul 2016 04:45:11 +0800
Subject: [PATCH 25/50] Fix duplicated initialization code

---
 sklearn/linear_model/cd_fast.pyx              | 21 ++++++++++---------
 .../tests/test_coordinate_descent.py          |  1 -
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index ed73b94efa676..93f8d9b2529ca 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -170,18 +170,19 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
     cdef ASUM asum
 
     if floating is float:
-        R = np.empty(n_samples, dtype=np.float32)
-        XtA = np.empty(n_features, dtype=np.float32)
+        dtype = np.float32
         dot = sdot
         axpy = saxpy
         asum = sasum
     else:
-        R = np.empty(n_samples)
-        XtA = np.empty(n_features)
+        dtype = np.float64
         dot = ddot
         axpy = daxpy
         asum = dasum
 
+    R = np.empty(n_samples, dtype=dtype)
+    XtA = np.empty(n_features, dtype=dtype)
+
     cdef floating tmp
     cdef floating w_ii
     cdef floating d_w_max
@@ -350,20 +351,20 @@ def sparse_enet_coordinate_descent(floating [:] w,
     cdef ASUM asum
 
     if floating is float:
-        norm_cols_X = np.zeros(n_features, dtype=np.float32)
+        dtype = np.float32
         n_tasks = y.strides[0] / sizeof(float)
-        X_T_R = np.zeros(n_features, dtype=np.float32)
-        XtA = np.zeros(n_features, dtype=np.float32)
         dot = sdot
         asum = sasum
     else:
-        norm_cols_X = np.zeros(n_features, np.float64)
+        dtype = np.float64
         n_tasks = y.strides[0] / sizeof(DOUBLE)
-        X_T_R = np.zeros(n_features)
-        XtA = np.zeros(n_features)
         dot = ddot
         asum = dasum
 
+    norm_cols_X = np.zeros(n_features, dtype=dtype)
+    X_T_R = np.zeros(n_features, dtype=dtype)
+    XtA = np.zeros(n_features, dtype=dtype)
+
     cdef floating tmp
     cdef floating w_ii
     cdef floating d_w_max
diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index fc72364025a37..56b76bb9d0fd9 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -686,7 +686,6 @@ def test_enet_float_precision():
         y = dtype(y)
         ignore_warnings(clf.fit)(X, y)
 
-        assert_equal(clf.coef_.dtype, dtype)
         coef[dtype] = clf.coef_
 
     assert_array_almost_equal(coef[np.float32], coef[np.float64],

From e65bec018dc31ac1800bc3b7aa4cad08666cdc28 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 19 Jul 2016 10:36:31 +0800
Subject: [PATCH 26/50] Follow PEP8

---
 sklearn/linear_model/cd_fast.pyx | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 93f8d9b2529ca..a2e6bf8423365 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -120,14 +120,15 @@ cdef extern from "cblas.h":
     double dasum "cblas_dasum"(int N, double *X, int incX) nogil
     float sasum "cblas_sasum"(int N, float *X, int incX) nogil
     void dger "cblas_dger"(CBLAS_ORDER Order, int M, int N, double alpha,
-                double *X, int incX, double *Y, int incY, double *A, int lda) nogil
-    void dgemv "cblas_dgemv"(CBLAS_ORDER Order,
-                      CBLAS_TRANSPOSE TransA, int M, int N,
-                      double alpha, double *A, int lda,
-                      double *X, int incX, double beta,
-                      double *Y, int incY) nogil
+                           double *X, int incX, double *Y, int incY,
+                           double *A, int lda) nogil
+    void dgemv "cblas_dgemv"(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA,
+                             int M, int N, double alpha, double *A, int lda,
+                             double *X, int incX, double beta,
+                             double *Y, int incY) nogil
     double dnrm2 "cblas_dnrm2"(int N, double *X, int incX) nogil
-    void dcopy "cblas_dcopy"(int N, double *X, int incX, double *Y, int incY) nogil
+    void dcopy "cblas_dcopy"(int N, double *X, int incX, double *Y,
+                             int incY) nogil
     void dscal "cblas_dscal"(int N, double alpha, double *X, int incX) nogil
 
 

From f4b247b52420f6f688969716f0f0bee6bee149bb Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Fri, 22 Jul 2016 18:33:47 +0800
Subject: [PATCH 27/50] Raise tmp precision to double

---
 sklearn/linear_model/cd_fast.pyx |   9 +-
 sklearn/src/cblas/ATL_dsrefdot.c | 141 +++++++++++++++++++++++++++++++
 sklearn/src/cblas/cblas_dsdot.c  |  53 ++++++++++++
 3 files changed, 201 insertions(+), 2 deletions(-)
 create mode 100644 sklearn/src/cblas/ATL_dsrefdot.c
 create mode 100644 sklearn/src/cblas/cblas_dsdot.c

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index a2e6bf8423365..47ba18e05b255 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -117,6 +117,8 @@ cdef extern from "cblas.h":
                              ) nogil
     float sdot "cblas_sdot"(int N, float *X, int incX, float *Y,
                             int incY) nogil
+    double dsdot "cblas_dsdot"(int N, float *X, int incX, float *Y,
+                               int incY) nogil
     double dasum "cblas_dasum"(int N, double *X, int incX) nogil
     float sasum "cblas_sasum"(int N, float *X, int incX) nogil
     void dger "cblas_dger"(CBLAS_ORDER Order, int M, int N, double alpha,
@@ -184,7 +186,7 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
     R = np.empty(n_samples, dtype=dtype)
     XtA = np.empty(n_features, dtype=dtype)
 
-    cdef floating tmp
+    cdef double tmp
     cdef floating w_ii
     cdef floating d_w_max
     cdef floating w_max
@@ -242,7 +244,10 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
                          R_data, 1)
 
                 # tmp = (X[:,ii]*R).sum()
-                tmp = dot(n_samples, &X_data[ii * n_samples], 1, R_data, 1)
+                if floating is float:
+                    tmp = dsdot(n_samples, &X_data[ii * n_samples], 1, R_data, 1)
+                else:
+                    tmp = dot(n_samples, &X_data[ii * n_samples], 1, R_data, 1)
 
                 if positive and tmp < 0:
                     w[ii] = 0.0
diff --git a/sklearn/src/cblas/ATL_dsrefdot.c b/sklearn/src/cblas/ATL_dsrefdot.c
new file mode 100644
index 0000000000000..442e51a08e207
--- /dev/null
+++ b/sklearn/src/cblas/ATL_dsrefdot.c
@@ -0,0 +1,141 @@
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ *    (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.2 -- December 25, 2000
+ *
+ * Author         : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
+ *
+ *  Redistribution  and  use in  source and binary forms, with or without
+ *  modification, are  permitted provided  that the following  conditions
+ *  are met:
+ *
+ * 1. Redistributions  of  source  code  must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce  the above copyright
+ *    notice,  this list of conditions, and the  following disclaimer in
+ *    the documentation and/or other materials provided with the distri-
+ *    bution.
+ * 3. The name of the University,  the ATLAS group,  or the names of its
+ *    contributors  may not be used to endorse or promote products deri-
+ *    ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,  INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO,  PROCUREMENT  OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
+ */
+/*
+ * Include files
+ */
+#include "atlas_refmisc.h"
+#include "atlas_reflevel1.h"
+
+double ATL_dsrefdot
+(
+   const int                  N,
+   const float                * X,
+   const int                  INCX,
+   const float                * Y,
+   const int                  INCY
+)
+{
+/*
+ * Purpose
+ * =======
+ *
+ * ATL_dsrefdot  returns the dot product x^T * y of two n-vectors x and
+ * y.  The result is internally computed using double precision arithme-
+ * tic.
+ *
+ * Arguments
+ * =========
+ *
+ * N       (input)                       const int
+ *         On entry, N specifies the length of the vector x. N  must  be
+ *         at least zero. Unchanged on exit.
+ *
+ * X       (input)                       const float *
+ *         On entry,  X  points to the  first entry to be accessed of an
+ *         incremented array of size equal to or greater than
+ *            ( 1 + ( n - 1 ) * abs( INCX ) ) * sizeof(   float   ),
+ *         that contains the vector x. Unchanged on exit.
+ *
+ * INCX    (input)                       const int
+ *         On entry, INCX specifies the increment for the elements of X.
+ *         INCX must not be zero. Unchanged on exit.
+ *
+ * Y       (input)                       const float *
+ *         On entry,  Y  points to the  first entry to be accessed of an
+ *         incremented array of size equal to or greater than
+ *            ( 1 + ( n - 1 ) * abs( INCY ) ) * sizeof(   float   ),
+ *         that contains the vector y. Unchanged on exit.
+ *
+ * INCY    (input)                       const int
+ *         On entry, INCY specifies the increment for the elements of Y.
+ *         INCY must not be zero. Unchanged on exit.
+ *
+ * ---------------------------------------------------------------------
+ */
+/*
+ * .. Local Variables ..
+ */
+   register double            dot = ATL_dZERO, x0, x1, x2, x3, y0, y1, y2, y3;
+   float                      * StX;
+   register int               i;
+   int                        nu;
+   const int                  incX2 = 2 * INCX, incY2 = 2 * INCY,
+                              incX3 = 3 * INCX, incY3 = 3 * INCY,
+                              incX4 = 4 * INCX, incY4 = 4 * INCY;
+/* ..
+ * .. Executable Statements ..
+ *
+ */
+   if( N > 0 )
+   {
+      if( ( nu = ( N >> 2 ) << 2 ) != 0 )
+      {
+         StX = (float *)X + nu * INCX;
+
+         do
+         {
+            x0 = (double)(*X);       y0 = (double)(*Y);
+            x1 = (double)(X[INCX ]); y1 = (double)(Y[INCY ]);
+            x2 = (double)(X[incX2]); y2 = (double)(Y[incY2]);
+            x3 = (double)(X[incX3]); y3 = (double)(Y[incY3]);
+            dot += x0 * y0; dot += x1 * y1; dot += x2 * y2; dot += x3 * y3;
+            X  += incX4;
+            Y  += incY4;
+
+         } while( X != StX );
+      }
+
+      for( i = N - nu; i != 0; i-- )
+      {
+         x0 = (double)(*X); y0 = (double)(*Y); dot += x0 * y0;
+         X += INCX; Y += INCY;
+      }
+   }
+   return( dot );
+/*
+ * End of ATL_dsrefdot
+ */
+}
diff --git a/sklearn/src/cblas/cblas_dsdot.c b/sklearn/src/cblas/cblas_dsdot.c
new file mode 100644
index 0000000000000..babf2016ccdb7
--- /dev/null
+++ b/sklearn/src/cblas/cblas_dsdot.c
@@ -0,0 +1,53 @@
+/*
+ *             Automatically Tuned Linear Algebra Software v3.10.2
+ *                    (C) Copyright 1999 R. Clint Whaley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *   1. Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions, and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *   3. The name of the ATLAS group or the names of its contributers may
+ *      not be used to endorse or promote products derived from this
+ *      software without specific written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#define SREAL
+#include "atlas_misc.h"
+#ifdef ATL_USEPTHREADS
+   #include "atlas_ptalias1.h"
+#endif
+#include "atlas_level1.h"
+#include "cblas.h"
+
+double cblas_dsdot(const int N, const float *X, const int incX,
+                   const float *Y, const int incY)
+{
+   if (N > 0)
+   {
+      if (incX < 0)
+      {
+         if (incY < 0) return(ATL_dsdot(N, X, -incX, Y, -incY));
+         else return(ATL_dsdot(N, X+(1-N)*incX, incX, Y, incY));
+      }
+      else if (incY < 0) return(ATL_dsdot(N, X+(N-1)*incX, -incX, Y, -incY));
+      else return(ATL_dsdot(N, X, incX, Y, incY));
+   }
+   else return(0.0);
+}

From e948157348d83b9b2e15208251f1442f6aa4feeb Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Mon, 25 Jul 2016 01:45:08 +0800
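Subject: [PATCH 28/50] Add 64 bit computer check

Define IS_64_BIT from the interpreter's pointer size
(struct.calcsize("P") * 8 == 64). In enet_path and in fit, request
dtype=[np.float64, np.float32] only when IS_64_BIT is true; otherwise
request np.float64 only.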
---
 sklearn/linear_model/coordinate_descent.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 4da452f15d865..a0fdf2e3dd990 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -27,6 +27,8 @@
 from ..exceptions import ConvergenceWarning
 
 from . import cd_fast
+import struct
+IS_64_BIT = True if struct.calcsize("P") * 8 == 64 else False
 
 
 ###############################################################################
@@ -375,7 +377,11 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     # We expect X and y to be already Fortran ordered when bypassing
     # checks
     if check_input:
-        X = check_array(X, 'csc', dtype=[np.float64, np.float32], order='F', copy=copy_X)
+        if IS_64_BIT:
+            dtype = [np.float64, np.float32]
+        else:
+            dtype = np.float64
+        X = check_array(X, 'csc', dtype=dtype, order='F', copy=copy_X)
         y = check_array(y, 'csc', dtype=X.dtype.type, order='F', copy=False,
                         ensure_2d=False)
         if Xy is not None:
@@ -673,8 +679,12 @@ def fit(self, X, y, check_input=True):
         # when bypassing checks
         if check_input:
             y = np.asarray(y)
+            if IS_64_BIT:
+                dtype = [np.float64, np.float32]
+            else:
+                dtype = np.float64
             X, y = check_X_y(X, y, accept_sparse='csc',
-                                order='F', dtype=[np.float64, np.float32],
+                                order='F', dtype=dtype,
                                 copy=self.copy_X and self.fit_intercept,
                                 multi_output=True, y_numeric=True)
             y = check_array(y, order='F', copy=False, dtype=X.dtype.type,

From 6a15fa68e81aa61f2c5aa98e24ccf576f95a29f5 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Mon, 25 Jul 2016 08:17:08 +0800
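Subject: [PATCH 29/50] Fix test

In test_enet_float_precision, build the dataset with n_samples=20 and
n_features=10 (dropping n_informative_features=100) and construct
ElasticNet without the explicit l1_ratio=0.3.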
---
 sklearn/linear_model/tests/test_coordinate_descent.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index 56b76bb9d0fd9..25469b2e0ee53 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -674,11 +674,10 @@ def test_lasso_non_float_y():
 
 def test_enet_float_precision():
     # Generate dataset
-    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
-                                         n_informative_features=100)
+    X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
     # Here we have a small number of iterations, and thus the
     # ElasticNet might not converge. This is to speed up tests
-    clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=100, precompute=False)
+    clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False)
 
     coef = {}
     for dtype in [np.float64, np.float32]:

From 1591b0cba5d2e6f78df6744073b44441230a823c Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Mon, 25 Jul 2016 21:27:28 +0800
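Subject: [PATCH 30/50] Add constraint

Replace IS_64_BIT with VALID_FOR_32_INPUT, which is true only when the
pointer size is 64 bits and the second component of np.__version__ is
at least 10. Both call sites (enet_path and fit) now test the new flag.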
---
 sklearn/linear_model/coordinate_descent.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index a0fdf2e3dd990..c623345041ac3 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -28,7 +28,12 @@
 
 from . import cd_fast
 import struct
-IS_64_BIT = True if struct.calcsize("P") * 8 == 64 else False
+
+if (struct.calcsize("P") * 8 == 64
+    and int(np.__version__.split('.')[1]) >= 10):
+    VALID_FOR_32_INPUT = True
+else:
+    VALID_FOR_32_INPUT = False
 
 
 ###############################################################################
@@ -377,7 +382,7 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     # We expect X and y to be already Fortran ordered when bypassing
     # checks
     if check_input:
-        if IS_64_BIT:
+        if VALID_FOR_32_INPUT:
             dtype = [np.float64, np.float32]
         else:
             dtype = np.float64
@@ -679,7 +684,7 @@ def fit(self, X, y, check_input=True):
         # when bypassing checks
         if check_input:
             y = np.asarray(y)
-            if IS_64_BIT:
+            if VALID_FOR_32_INPUT:
                 dtype = [np.float64, np.float32]
             else:
                 dtype = np.float64

From 4ffaac03224eba3c4f40791ec0af24c1b7ab4147 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Wed, 27 Jul 2016 11:31:59 +0800
Subject: [PATCH 31/50] PEP 8

---
 sklearn/linear_model/cd_fast.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 47ba18e05b255..26b8cf7f92300 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -253,7 +253,7 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
                     w[ii] = 0.0
                 else:
                     w[ii] = (fsign(tmp) * fmax(fabs(tmp) - alpha, 0)
-                            / (norm_cols_X[ii] + beta))
+                             / (norm_cols_X[ii] + beta))
 
                 if w[ii] != 0.0:
                     # R -=  w[ii] * X[:,ii] # Update residual
@@ -268,9 +268,9 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
                 if fabs(w[ii]) > w_max:
                     w_max = fabs(w[ii])
 
-            if (w_max == 0.0
-                or d_w_max / w_max < d_w_tol
-                or n_iter == max_iter - 1):
+            if (w_max == 0.0 or
+                d_w_max / w_max < d_w_tol or
+                n_iter == max_iter - 1):
                 # the biggest coordinate update of this iteration was smaller
                 # than the tolerance: check the duality gap as ultimate
                 # stopping criterion

From 3d2790e08a6458e8132bd5439c9547b8ece7d801 Mon Sep 17 00:00:00 2001
From: Fabian Pedregosa <f@bianp.net>
Date: Thu, 11 Aug 2016 09:38:48 +0200
Subject: [PATCH 32/50] Make saxpy have the same structure as daxpy

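Move the reference axpy loop from ATL_srefaxpy.c (deleted) directly
into cblas_saxpy.c, so cblas_saxpy no longer dispatches to ATL_saxpy
and no longer includes atlas_misc.h, atlas_level1.h or cblas.h.

Note that the start-pointer adjustment the old wrapper applied for
negative incX/incY is not carried over.

Hopefully this fixes the problems outlined in PR #6913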
---
 sklearn/src/cblas/ATL_srefaxpy.c | 157 -------------------------
 sklearn/src/cblas/cblas_saxpy.c  | 190 ++++++++++++++++++++++++-------
 2 files changed, 147 insertions(+), 200 deletions(-)
 delete mode 100644 sklearn/src/cblas/ATL_srefaxpy.c

diff --git a/sklearn/src/cblas/ATL_srefaxpy.c b/sklearn/src/cblas/ATL_srefaxpy.c
deleted file mode 100644
index 306e161774ffb..0000000000000
--- a/sklearn/src/cblas/ATL_srefaxpy.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/* ---------------------------------------------------------------------
- *
- * -- Automatically Tuned Linear Algebra Software (ATLAS)
- *    (C) Copyright 2000 All Rights Reserved
- *
- * -- ATLAS routine -- Version 3.9.24 -- December 25, 2000
- *
- * Author         : Antoine P. Petitet
- * Originally developed at the University of Tennessee,
- * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
- *
- * ---------------------------------------------------------------------
- *
- * -- Copyright notice and Licensing terms:
- *
- *  Redistribution  and  use in  source and binary forms, with or without
- *  modification, are  permitted provided  that the following  conditions
- *  are met:
- *
- * 1. Redistributions  of  source  code  must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce  the above copyright
- *    notice,  this list of conditions, and the  following disclaimer in
- *    the documentation and/or other materials provided with the distri-
- *    bution.
- * 3. The name of the University,  the ATLAS group,  or the names of its
- *    contributors  may not be used to endorse or promote products deri-
- *    ved from this software without specific written permission.
- *
- * -- Disclaimer:
- *
- * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
- * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,  INDIRECT, INCIDENTAL, SPE-
- * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO,  PROCUREMENT  OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
- * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  (IN-
- * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
- * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * ---------------------------------------------------------------------
- */
-/*
- * Include files
- */
-#include "atlas_refmisc.h"
-#include "atlas_reflevel1.h"
-
-void ATL_srefaxpy
-(
-   const int                  N,
-   const float                ALPHA,
-   const float                * X,
-   const int                  INCX,
-   float                      * Y,
-   const int                  INCY
-)
-{
-/*
- * Purpose
- * =======
- *
- * ATL_srefaxpy performs the following operation:
- *
- *    y := y + alpha * x,
- *
- * where alpha is a scalar and x and y are two n-vectors.
- *
- * Arguments
- * =========
- *
- * N       (input)                       const int
- *         On entry, N specifies the length of the vector x. N  must  be
- *         at least zero. Unchanged on exit.
- *
- * ALPHA   (input)                       const float
- *         On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
- *         supplied as zero, then the entries of the incremented array X
- *         need not be set on input. Unchanged on exit.
- *
- * X       (input)                       const float *
- *         On entry,  X  points to the  first entry to be accessed of an
- *         incremented array of size equal to or greater than
- *            ( 1 + ( n - 1 ) * abs( INCX ) ) * sizeof(   float   ),
- *         that contains the vector x. Unchanged on exit.
- *
- * INCX    (input)                       const int
- *         On entry, INCX specifies the increment for the elements of X.
- *         INCX must not be zero. Unchanged on exit.
- *
- * Y       (input/output)                float *
- *         On entry,  Y  points to the  first entry to be accessed of an
- *         incremented array of size equal to or greater than
- *            ( 1 + ( n - 1 ) * abs( INCY ) ) * sizeof(   float   ),
- *         that contains the vector y.  On exit,  the entries of the in-
- *         cremented array  Y are updated with the scaled entries of the
- *         incremented array  X.
- *
- * INCY    (input)                       const int
- *         On entry, INCY specifies the increment for the elements of Y.
- *         INCY must not be zero. Unchanged on exit.
- *
- * ---------------------------------------------------------------------
- */
-/*
- * .. Local Variables ..
- */
-   register const float       alpha = ALPHA;
-   register float             x0, x1, x2, x3, y0, y1, y2, y3;
-   float                      * StX;
-   register int               i;
-   int                        nu;
-   const int                  incX2 = 2 * INCX, incY2 = 2 * INCY,
-                              incX3 = 3 * INCX, incY3 = 3 * INCY,
-                              incX4 = 4 * INCX, incY4 = 4 * INCY;
-/* ..
- * .. Executable Statements ..
- *
- */
-   if( ( N > 0 ) && ( alpha != ATL_sZERO ) )
-   {
-      if( ( nu = ( N >> 2 ) << 2 ) != 0 )
-      {
-         StX = (float *)X + nu * INCX;
-
-         do
-         {
-            x0 = (*X);     y0 = (*Y);     x1 = X[INCX ]; y1 = Y[INCY ];
-            x2 = X[incX2]; y2 = Y[incY2]; x3 = X[incX3]; y3 = Y[incY3];
-
-            *Y       = y0 + alpha * x0; Y[INCY ] = y1 + alpha * x1;
-            Y[incY2] = y2 + alpha * x2; Y[incY3] = y3 + alpha * x3;
-
-            X  += incX4;
-            Y  += incY4;
-
-         } while( X != StX );
-      }
-
-      for( i = N - nu; i != 0; i-- )
-      {
-         x0  = (*X);
-         y0  = (*Y);
-
-         *Y  = y0 + alpha * x0;
-
-         X  += INCX;
-         Y  += INCY;
-      }
-   }
-/*
- * End of ATL_srefaxpy
- */
-}
diff --git a/sklearn/src/cblas/cblas_saxpy.c b/sklearn/src/cblas/cblas_saxpy.c
index 911c17d6b02c6..19600a53a5127 100644
--- a/sklearn/src/cblas/cblas_saxpy.c
+++ b/sklearn/src/cblas/cblas_saxpy.c
@@ -1,52 +1,156 @@
-/*
- *             Automatically Tuned Linear Algebra Software v3.10.2
- *                    (C) Copyright 1999 R. Clint Whaley
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *   1. Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions, and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *   3. The name of the ATLAS group or the names of its contributers may
- *      not be used to endorse or promote products derived from this
- *      software without specific written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+/* ---------------------------------------------------------------------
+ *
+ * -- Automatically Tuned Linear Algebra Software (ATLAS)
+ *    (C) Copyright 2000 All Rights Reserved
+ *
+ * -- ATLAS routine -- Version 3.9.24 -- December 25, 2000
+ *
+ * Author         : Antoine P. Petitet
+ * Originally developed at the University of Tennessee,
+ * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * -- Copyright notice and Licensing terms:
  *
+ *  Redistribution  and  use in  source and binary forms, with or without
+ *  modification, are  permitted provided  that the following  conditions
+ *  are met:
+ *
+ * 1. Redistributions  of  source  code  must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce  the above copyright
+ *    notice,  this list of conditions, and the  following disclaimer in
+ *    the documentation and/or other materials provided with the distri-
+ *    bution.
+ * 3. The name of the University,  the ATLAS group,  or the names of its
+ *    contributors  may not be used to endorse or promote products deri-
+ *    ved from this software without specific written permission.
+ *
+ * -- Disclaimer:
+ *
+ * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,  INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO,  PROCUREMENT  OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
+ * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT  (IN-
+ * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---------------------------------------------------------------------
  */
+/*
+ * Include files
+ */
+#include "atlas_refmisc.h"
 
-#define SREAL
-#include "atlas_misc.h"
-#ifdef ATL_USEPTHREADS
-   #include "atlas_ptalias1.h"
-#endif
-#include "atlas_level1.h"
-#include "cblas.h"
-
-void cblas_saxpy(const int N, const float alpha, const float *X,
-                 const int incX, float *Y, const int incY)
+void cblas_saxpy
+(
+   const int                  N,
+   const float                ALPHA,
+   const float                * X,
+   const int                  INCX,
+   float                      * Y,
+   const int                  INCY
+)
 {
-   if (N > 0)
+/*
+ * Purpose
+ * =======
+ *
+ * ATL_srefaxpy performs the following operation:
+ *
+ *    y := y + alpha * x,
+ *
+ * where alpha is a scalar and x and y are two n-vectors.
+ *
+ * Arguments
+ * =========
+ *
+ * N       (input)                       const int
+ *         On entry, N specifies the length of the vector x. N  must  be
+ *         at least zero. Unchanged on exit.
+ *
+ * ALPHA   (input)                       const float
+ *         On entry, ALPHA specifies the scalar alpha.   When  ALPHA  is
+ *         supplied as zero, then the entries of the incremented array X
+ *         need not be set on input. Unchanged on exit.
+ *
+ * X       (input)                       const float *
+ *         On entry,  X  points to the  first entry to be accessed of an
+ *         incremented array of size equal to or greater than
+ *            ( 1 + ( n - 1 ) * abs( INCX ) ) * sizeof(   float   ),
+ *         that contains the vector x. Unchanged on exit.
+ *
+ * INCX    (input)                       const int
+ *         On entry, INCX specifies the increment for the elements of X.
+ *         INCX must not be zero. Unchanged on exit.
+ *
+ * Y       (input/output)                float *
+ *         On entry,  Y  points to the  first entry to be accessed of an
+ *         incremented array of size equal to or greater than
+ *            ( 1 + ( n - 1 ) * abs( INCY ) ) * sizeof(   float   ),
+ *         that contains the vector y.  On exit,  the entries of the in-
+ *         cremented array  Y are updated with the scaled entries of the
+ *         incremented array  X.
+ *
+ * INCY    (input)                       const int
+ *         On entry, INCY specifies the increment for the elements of Y.
+ *         INCY must not be zero. Unchanged on exit.
+ *
+ * ---------------------------------------------------------------------
+ */
+/*
+ * .. Local Variables ..
+ */
+   register const float       alpha = ALPHA;
+   register float             x0, x1, x2, x3, y0, y1, y2, y3;
+   float                      * StX;
+   register int               i;
+   int                        nu;
+   const int                  incX2 = 2 * INCX, incY2 = 2 * INCY,
+                              incX3 = 3 * INCX, incY3 = 3 * INCY,
+                              incX4 = 4 * INCX, incY4 = 4 * INCY;
+/* ..
+ * .. Executable Statements ..
+ *
+ */
+   if( ( N > 0 ) && ( alpha != ATL_sZERO ) )
    {
-      if (incX < 0)
+      if( ( nu = ( N >> 2 ) << 2 ) != 0 )
+      {
+         StX = (float *)X + nu * INCX;
+
+         do
+         {
+            x0 = (*X);     y0 = (*Y);     x1 = X[INCX ]; y1 = Y[INCY ];
+            x2 = X[incX2]; y2 = Y[incY2]; x3 = X[incX3]; y3 = Y[incY3];
+
+            *Y       = y0 + alpha * x0; Y[INCY ] = y1 + alpha * x1;
+            Y[incY2] = y2 + alpha * x2; Y[incY3] = y3 + alpha * x3;
+
+            X  += incX4;
+            Y  += incY4;
+
+         } while( X != StX );
+      }
+
+      for( i = N - nu; i != 0; i-- )
       {
-         if (incY < 0) ATL_saxpy(N, alpha, X, -incX, Y, -incY);
-         else ATL_saxpy(N, alpha, X+(1-N)*incX, incX, Y, incY);
+         x0  = (*X);
+         y0  = (*Y);
+
+         *Y  = y0 + alpha * x0;
+
+         X  += INCX;
+         Y  += INCY;
       }
-      else if (incY < 0) ATL_saxpy(N, alpha, X+(N-1)*incX, -incX, Y, -incY);
-      else ATL_saxpy(N, alpha, X, incX, Y, incY);
    }
+/*
+ * End of ATL_srefaxpy
+ */
 }

From c745af4443daea6a99b960a35f40ed7df9582b2c Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Fri, 12 Aug 2016 15:07:48 +0800
Subject: [PATCH 33/50] Remove wrong hardware test

---
 sklearn/linear_model/coordinate_descent.py | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index c623345041ac3..dc1d9db1ff237 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -27,13 +27,6 @@
 from ..exceptions import ConvergenceWarning
 
 from . import cd_fast
-import struct
-
-if (struct.calcsize("P") * 8 == 64
-    and int(np.__version__.split('.')[1]) >= 10):
-    VALID_FOR_32_INPUT = True
-else:
-    VALID_FOR_32_INPUT = False
 
 
 ###############################################################################
@@ -382,11 +375,8 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
     # We expect X and y to be already Fortran ordered when bypassing
     # checks
     if check_input:
-        if VALID_FOR_32_INPUT:
-            dtype = [np.float64, np.float32]
-        else:
-            dtype = np.float64
-        X = check_array(X, 'csc', dtype=dtype, order='F', copy=copy_X)
+        X = check_array(X, 'csc', dtype=[np.float64, np.float32],
+                        order='F', copy=copy_X)
         y = check_array(y, 'csc', dtype=X.dtype.type, order='F', copy=False,
                         ensure_2d=False)
         if Xy is not None:
@@ -684,12 +674,8 @@ def fit(self, X, y, check_input=True):
         # when bypassing checks
         if check_input:
             y = np.asarray(y)
-            if VALID_FOR_32_INPUT:
-                dtype = [np.float64, np.float32]
-            else:
-                dtype = np.float64
             X, y = check_X_y(X, y, accept_sparse='csc',
-                                order='F', dtype=dtype,
+                                order='F', dtype=[np.float64, np.float32],
                                 copy=self.copy_X and self.fit_intercept,
                                 multi_output=True, y_numeric=True)
             y = check_array(y, order='F', copy=False, dtype=X.dtype.type,

From 8b04c53842dee2fd624dfc6b36280851014e620a Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Fri, 12 Aug 2016 16:27:38 +0800
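Subject: [PATCH 34/50] Remove dsdot

Drop the cblas_dsdot declaration from cd_fast.pyx and delete
sklearn/src/cblas/cblas_dsdot.c. In enet_coordinate_descent, tmp goes
back to the fused floating type and is always computed with dot().
ATL_dsrefdot.c is not touched by this patch.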
---
 sklearn/linear_model/cd_fast.pyx |  9 ++----
 sklearn/src/cblas/cblas_dsdot.c  | 53 --------------------------------
 2 files changed, 2 insertions(+), 60 deletions(-)
 delete mode 100644 sklearn/src/cblas/cblas_dsdot.c

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 26b8cf7f92300..7a6548b4468a1 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -117,8 +117,6 @@ cdef extern from "cblas.h":
                              ) nogil
     float sdot "cblas_sdot"(int N, float *X, int incX, float *Y,
                             int incY) nogil
-    double dsdot "cblas_dsdot"(int N, float *X, int incX, float *Y,
-                               int incY) nogil
     double dasum "cblas_dasum"(int N, double *X, int incX) nogil
     float sasum "cblas_sasum"(int N, float *X, int incX) nogil
     void dger "cblas_dger"(CBLAS_ORDER Order, int M, int N, double alpha,
@@ -186,7 +184,7 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
     R = np.empty(n_samples, dtype=dtype)
     XtA = np.empty(n_features, dtype=dtype)
 
-    cdef double tmp
+    cdef floating tmp
     cdef floating w_ii
     cdef floating d_w_max
     cdef floating w_max
@@ -244,10 +242,7 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
                          R_data, 1)
 
                 # tmp = (X[:,ii]*R).sum()
-                if floating is float:
-                    tmp = dsdot(n_samples, &X_data[ii * n_samples], 1, R_data, 1)
-                else:
-                    tmp = dot(n_samples, &X_data[ii * n_samples], 1, R_data, 1)
+                tmp = dot(n_samples, &X_data[ii * n_samples], 1, R_data, 1)
 
                 if positive and tmp < 0:
                     w[ii] = 0.0
diff --git a/sklearn/src/cblas/cblas_dsdot.c b/sklearn/src/cblas/cblas_dsdot.c
deleted file mode 100644
index babf2016ccdb7..0000000000000
--- a/sklearn/src/cblas/cblas_dsdot.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- *             Automatically Tuned Linear Algebra Software v3.10.2
- *                    (C) Copyright 1999 R. Clint Whaley
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *   1. Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *   2. Redistributions in binary form must reproduce the above copyright
- *      notice, this list of conditions, and the following disclaimer in the
- *      documentation and/or other materials provided with the distribution.
- *   3. The name of the ATLAS group or the names of its contributers may
- *      not be used to endorse or promote products derived from this
- *      software without specific written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#define SREAL
-#include "atlas_misc.h"
-#ifdef ATL_USEPTHREADS
-   #include "atlas_ptalias1.h"
-#endif
-#include "atlas_level1.h"
-#include "cblas.h"
-
-double cblas_dsdot(const int N, const float *X, const int incX,
-                   const float *Y, const int incY)
-{
-   if (N > 0)
-   {
-      if (incX < 0)
-      {
-         if (incY < 0) return(ATL_dsdot(N, X, -incX, Y, -incY));
-         else return(ATL_dsdot(N, X+(1-N)*incX, incX, Y, incY));
-      }
-      else if (incY < 0) return(ATL_dsdot(N, X+(N-1)*incX, -incX, Y, -incY));
-      else return(ATL_dsdot(N, X, incX, Y, incY));
-   }
-   else return(0.0);
-}

From b035f34c67d08ea3760abbf9728adab3c5d0c90a Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 16 Aug 2016 00:21:26 +0800
Subject: [PATCH 35/50] Remove redundant asarray

---
 sklearn/linear_model/coordinate_descent.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index dc1d9db1ff237..62e2f0686f4f1 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -673,7 +673,6 @@ def fit(self, X, y, check_input=True):
         # We expect X and y to be already float64 Fortran ordered arrays
         # when bypassing checks
         if check_input:
-            y = np.asarray(y)
             X, y = check_X_y(X, y, accept_sparse='csc',
                                 order='F', dtype=[np.float64, np.float32],
                                 copy=self.copy_X and self.fit_intercept,

From c967912bbc7637ca7ce6b6e0d71d9fc67a7a7f96 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 16 Aug 2016 00:40:28 +0800
Subject: [PATCH 36/50] Add test for fit_intercept

---
 .../tests/test_coordinate_descent.py          | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index 25469b2e0ee53..77129f6654bd4 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -677,15 +677,17 @@ def test_enet_float_precision():
     X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
     # Here we have a small number of iterations, and thus the
     # ElasticNet might not converge. This is to speed up tests
-    clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False)
 
-    coef = {}
-    for dtype in [np.float64, np.float32]:
-        X = dtype(X)
-        y = dtype(y)
-        ignore_warnings(clf.fit)(X, y)
+    for fit_intercept in [True, False]:
+        coef = {}
+        clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
+                         fit_intercept=fit_intercept)
+        for dtype in [np.float64, np.float32]:
+            X = dtype(X)
+            y = dtype(y)
+            ignore_warnings(clf.fit)(X, y)
 
-        coef[dtype] = clf.coef_
+            coef[dtype] = clf.coef_
 
-    assert_array_almost_equal(coef[np.float32], coef[np.float64],
-                              decimal=4)
+        assert_array_almost_equal(coef[np.float32], coef[np.float64],
+                                  decimal=4)

From 45b4aaa9626bd2f2b7c76c1c8e868bdbb9908db2 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 16 Aug 2016 13:14:26 +0800
Subject: [PATCH 37/50] Make _preprocess_data support other dtypes

---
 sklearn/linear_model/base.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index ae5c447d95d3f..690ff288b94c3 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -171,7 +171,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
         if sp.issparse(X):
             X_offset, X_var = mean_variance_axis(X, axis=0)
             if not return_mean:
-                X_offset = np.zeros(X.shape[1])
+                X_offset = np.zeros(X.shape[1], dtype=X.dtype)
 
             if normalize:
 
@@ -186,7 +186,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                 X_scale[X_scale == 0] = 1
                 inplace_column_scale(X, 1. / X_scale)
             else:
-                X_scale = np.ones(X.shape[1])
+                X_scale = np.ones(X.shape[1], dtype=X.dtype)
 
         else:
             X_offset = np.average(X, axis=0, weights=sample_weight)
@@ -195,12 +195,12 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                 X, X_scale = f_normalize(X, axis=0, copy=False,
                                          return_norm=True)
             else:
-                X_scale = np.ones(X.shape[1])
+                X_scale = np.ones(X.shape[1], dtype=X.dtype)
         y_offset = np.average(y, axis=0, weights=sample_weight)
         y = y - y_offset
     else:
-        X_offset = np.zeros(X.shape[1])
-        X_scale = np.ones(X.shape[1])
+        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
+        X_scale = np.ones(X.shape[1], dtype=X.dtype)
         y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
 
     return X, y, X_offset, y_offset, X_scale
@@ -273,11 +273,6 @@ def _set_intercept(self, X_offset, y_offset, X_scale):
         """Set the intercept_
         """
         if self.fit_intercept:
-            if isinstance(self.coef_, np.ndarray):
-                dtype = self.coef_.dtype
-                X_offset = np.asarray(X_offset, dtype)
-                y_offset = np.asarray(y_offset, dtype)
-                X_scale = np.asarray(X_scale, dtype)
             self.coef_ = self.coef_ / X_scale
             self.intercept_ = y_offset - np.dot(X_offset, self.coef_.T)
         else:

From dd4a42ea6a1aa219d24f79cd1886e7e259cd2969 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 16 Aug 2016 13:49:00 +0800
Subject: [PATCH 38/50] Add concrete value

---
 sklearn/linear_model/coordinate_descent.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 62e2f0686f4f1..e4a4c25889351 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -477,8 +477,9 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
                           ConvergenceWarning)
             if X.dtype == np.float32:
                 warnings.warn('It may cause by precision issues' +
-                              ' when fitting float32 data with small alpha.'
-                              ' Try to increase alpha of your model.')
+                              ' when fitting float32 data with small alpha,' +
+                              ' e.g., 1e-8.' +
+                              'Try to increase alpha of your model.')
 
         if verbose:
             if verbose > 2:

From 23b6c2ab2be440041ed664e9a5ec442b56071b28 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Thu, 18 Aug 2016 11:49:13 +0800
Subject: [PATCH 39/50] Workaround

---
 sklearn/linear_model/base.py               | 10 +++++-----
 sklearn/linear_model/coordinate_descent.py |  3 +++
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
index 690ff288b94c3..f713593741726 100644
--- a/sklearn/linear_model/base.py
+++ b/sklearn/linear_model/base.py
@@ -171,7 +171,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
         if sp.issparse(X):
             X_offset, X_var = mean_variance_axis(X, axis=0)
             if not return_mean:
-                X_offset = np.zeros(X.shape[1], dtype=X.dtype)
+                X_offset = np.zeros(X.shape[1])
 
             if normalize:
 
@@ -186,7 +186,7 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                 X_scale[X_scale == 0] = 1
                 inplace_column_scale(X, 1. / X_scale)
             else:
-                X_scale = np.ones(X.shape[1], dtype=X.dtype)
+                X_scale = np.ones(X.shape[1])
 
         else:
             X_offset = np.average(X, axis=0, weights=sample_weight)
@@ -195,12 +195,12 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
                 X, X_scale = f_normalize(X, axis=0, copy=False,
                                          return_norm=True)
             else:
-                X_scale = np.ones(X.shape[1], dtype=X.dtype)
+                X_scale = np.ones(X.shape[1])
         y_offset = np.average(y, axis=0, weights=sample_weight)
         y = y - y_offset
     else:
-        X_offset = np.zeros(X.shape[1], dtype=X.dtype)
-        X_scale = np.ones(X.shape[1], dtype=X.dtype)
+        X_offset = np.zeros(X.shape[1])
+        X_scale = np.ones(X.shape[1])
         y_offset = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
 
     return X, y, X_offset, y_offset, X_scale
diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index e4a4c25889351..e5d8910a0e70d 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -733,6 +733,9 @@ def fit(self, X, y, check_input=True):
         self.coef_, self.dual_gap_ = map(np.squeeze, [coef_, dual_gaps_])
         self._set_intercept(X_offset, y_offset, X_scale)
 
+        # workaround since _set_intercept will cast self.coef_ into float64
+        self.coef_ = np.asarray(self.coef_, dtype=X.dtype)
+
         # return self for chaining fit and predict calls
         return self
 

From 116ec7965b0e0f565621ecb02076cf0535532cc0 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Sun, 21 Aug 2016 14:39:50 +0800
Subject: [PATCH 40/50] Fix error msg

---
 sklearn/linear_model/coordinate_descent.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index e5d8910a0e70d..529e6c8380ebd 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -473,13 +473,10 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
         if dual_gap_ > eps_:
             warnings.warn('Objective did not converge.' +
                           ' You might want' +
-                          ' to increase the number of iterations',
+                          ' to increase the number of iterations.' +
+                          ' Fitting float32 data with small alpha,' +
+                          ' e.g., 1e-8, may cause precision issues.',
                           ConvergenceWarning)
-            if X.dtype == np.float32:
-                warnings.warn('It may cause by precision issues' +
-                              ' when fitting float32 data with small alpha,' +
-                              ' e.g., 1e-8.' +
-                              'Try to increase alpha of your model.')
 
         if verbose:
             if verbose > 2:

From 470d8ab730f89c63dab491feb9cf59b54b7e443f Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Sun, 21 Aug 2016 14:45:02 +0800
Subject: [PATCH 41/50] Move declaration

---
 sklearn/linear_model/cd_fast.pyx | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 7a6548b4468a1..2618093b59dda 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -150,21 +150,6 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
 
     """
 
-    # get the data information into easy vars
-    cdef unsigned int n_samples = X.shape[0]
-    cdef unsigned int n_features = X.shape[1]
-
-    # get the number of tasks indirectly, using strides
-    cdef unsigned int n_tasks = y.strides[0] / sizeof(floating)
-
-    # compute norms of the columns of X
-    cdef np.ndarray[floating, ndim=1] norm_cols_X = (X**2).sum(axis=0)
-
-    # initial value of the residuals
-    cdef np.ndarray[floating, ndim=1] R
-
-    cdef np.ndarray[floating, ndim=1] XtA
-
     # fused types version of BLAS functions
     cdef DOT dot
     cdef AXPY axpy
@@ -181,8 +166,19 @@ def enet_coordinate_descent(np.ndarray[floating, ndim=1] w,
         axpy = daxpy
         asum = dasum
 
-    R = np.empty(n_samples, dtype=dtype)
-    XtA = np.empty(n_features, dtype=dtype)
+    # get the data information into easy vars
+    cdef unsigned int n_samples = X.shape[0]
+    cdef unsigned int n_features = X.shape[1]
+
+    # get the number of tasks indirectly, using strides
+    cdef unsigned int n_tasks = y.strides[0] / sizeof(floating)
+
+    # compute norms of the columns of X
+    cdef np.ndarray[floating, ndim=1] norm_cols_X = (X**2).sum(axis=0)
+
+    # initial value of the residuals
+    cdef np.ndarray[floating, ndim=1] R = np.empty(n_samples, dtype=dtype)
+    cdef np.ndarray[floating, ndim=1] XtA = np.empty(n_features, dtype=dtype)
 
     cdef floating tmp
     cdef floating w_ii

From f868af7df741b3467ccf3bfea887870595e1643a Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Sun, 21 Aug 2016 14:56:12 +0800
Subject: [PATCH 42/50] Remove redundant comment

---
 sklearn/linear_model/cd_fast.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/linear_model/cd_fast.pyx b/sklearn/linear_model/cd_fast.pyx
index 2618093b59dda..1dcd10feb1063 100644
--- a/sklearn/linear_model/cd_fast.pyx
+++ b/sklearn/linear_model/cd_fast.pyx
@@ -511,7 +511,6 @@ def sparse_enet_coordinate_descent(floating [:] w,
 
                 l1_norm = asum(n_features, &w[0], 1)
 
-                # The expression inside ddot is equivalent to np.dot(R.T, y)
                 gap += (alpha * l1_norm - const * dot(
                             n_samples,
                             &R[0], 1,

From 0e88af206dd7ba831cfb63df37d7be8e78c4c583 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Sun, 21 Aug 2016 15:12:51 +0800
Subject: [PATCH 43/50] Add tests

---
 sklearn/linear_model/tests/test_coordinate_descent.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index 77129f6654bd4..a513abfc9f8f8 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -680,6 +680,7 @@ def test_enet_float_precision():
 
     for fit_intercept in [True, False]:
         coef = {}
+        intercept = {}
         clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
                          fit_intercept=fit_intercept)
         for dtype in [np.float64, np.float32]:
@@ -688,6 +689,12 @@ def test_enet_float_precision():
             ignore_warnings(clf.fit)(X, y)
 
             coef[dtype] = clf.coef_
+            intercept[dtype] = clf.intercept_
+
+            assert_equal(clf.coef_.dtype, dtype)
 
         assert_array_almost_equal(coef[np.float32], coef[np.float64],
                                   decimal=4)
+        assert_array_almost_equal(intercept[np.float32],
+                                  intercept[np.float64],
+                                  decimal=4)

From 14237e8dbd9d3465803900a1cb75ee918faf735a Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Mon, 22 Aug 2016 14:44:02 +0800
Subject: [PATCH 44/50] Test normalize

---
 .../tests/test_coordinate_descent.py          | 41 ++++++++++---------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py
index a513abfc9f8f8..9065c5b97dc4f 100644
--- a/sklearn/linear_model/tests/test_coordinate_descent.py
+++ b/sklearn/linear_model/tests/test_coordinate_descent.py
@@ -678,23 +678,24 @@ def test_enet_float_precision():
     # Here we have a small number of iterations, and thus the
     # ElasticNet might not converge. This is to speed up tests
 
-    for fit_intercept in [True, False]:
-        coef = {}
-        intercept = {}
-        clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
-                         fit_intercept=fit_intercept)
-        for dtype in [np.float64, np.float32]:
-            X = dtype(X)
-            y = dtype(y)
-            ignore_warnings(clf.fit)(X, y)
-
-            coef[dtype] = clf.coef_
-            intercept[dtype] = clf.intercept_
-
-            assert_equal(clf.coef_.dtype, dtype)
-
-        assert_array_almost_equal(coef[np.float32], coef[np.float64],
-                                  decimal=4)
-        assert_array_almost_equal(intercept[np.float32],
-                                  intercept[np.float64],
-                                  decimal=4)
+    for normalize in [True, False]:
+        for fit_intercept in [True, False]:
+            coef = {}
+            intercept = {}
+            clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
+                            fit_intercept=fit_intercept, normalize=normalize)
+            for dtype in [np.float64, np.float32]:
+                X = dtype(X)
+                y = dtype(y)
+                ignore_warnings(clf.fit)(X, y)
+
+                coef[dtype] = clf.coef_
+                intercept[dtype] = clf.intercept_
+
+                assert_equal(clf.coef_.dtype, dtype)
+
+            assert_array_almost_equal(coef[np.float32], coef[np.float64],
+                                    decimal=4)
+            assert_array_almost_equal(intercept[np.float32],
+                                    intercept[np.float64],
+                                    decimal=4)

From b4b9cf172ee64b4764f4e52df963fd594a283357 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 23 Aug 2016 00:27:25 +0800
Subject: [PATCH 45/50] Delete warning

---
 sklearn/linear_model/coordinate_descent.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 529e6c8380ebd..31a1649830e11 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -473,9 +473,7 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
         if dual_gap_ > eps_:
             warnings.warn('Objective did not converge.' +
                           ' You might want' +
-                          ' to increase the number of iterations.' +
-                          ' Fitting float32 data with small alpha,' +
-                          ' e.g., 1e-8, may cause precision issues.',
+                          ' to increase the number of iterations.',
                           ConvergenceWarning)
 
         if verbose:
@@ -672,9 +670,9 @@ def fit(self, X, y, check_input=True):
         # when bypassing checks
         if check_input:
             X, y = check_X_y(X, y, accept_sparse='csc',
-                                order='F', dtype=[np.float64, np.float32],
-                                copy=self.copy_X and self.fit_intercept,
-                                multi_output=True, y_numeric=True)
+                             order='F', dtype=[np.float64, np.float32],
+                             copy=self.copy_X and self.fit_intercept,
+                             multi_output=True, y_numeric=True)
             y = check_array(y, order='F', copy=False, dtype=X.dtype.type,
                             ensure_2d=False)
 

From 9348ad7961166fbfc3b68d4838b4ff4b491e7a98 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 23 Aug 2016 08:47:04 +0800
Subject: [PATCH 46/50] Fix comment

---
 sklearn/linear_model/coordinate_descent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 31a1649830e11..c0309931804c2 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -666,7 +666,7 @@ def fit(self, X, y, check_input=True):
             raise ValueError('precompute should be one of True, False or'
                              ' array-like. Got %r' % self.precompute)
 
-        # We expect X and y to be already float64 Fortran ordered arrays
+        # We expect X and y to be float64 or float32 Fortran ordered arrays
         # when bypassing checks
         if check_input:
             X, y = check_X_y(X, y, accept_sparse='csc',

From 82fdf0962e3c7b0965b54ca137a56ab6d01fc226 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Tue, 23 Aug 2016 19:16:43 +0800
Subject: [PATCH 47/50] Add error msg

---
 sklearn/linear_model/coordinate_descent.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index c0309931804c2..8ca55450abcb7 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -473,7 +473,9 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
         if dual_gap_ > eps_:
             warnings.warn('Objective did not converge.' +
                           ' You might want' +
-                          ' to increase the number of iterations.',
+                          ' to increase the number of iterations.' +
+                          ' Fitting data with alpha near zero, e.g., 1e-8,' +
+                          ' may cause precision problems.',
                           ConvergenceWarning)
 
         if verbose:

From d0b56bb87daafa1c9e9cff912f9f29037b4c9ccb Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Wed, 24 Aug 2016 20:03:07 +0800
Subject: [PATCH 48/50] Add error msg

---
 sklearn/linear_model/coordinate_descent.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index 8ca55450abcb7..ae3bee0ff9878 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -474,7 +474,8 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
             warnings.warn('Objective did not converge.' +
                           ' You might want' +
                           ' to increase the number of iterations.' +
-                          ' Fitting data with alpha near zero, e.g., 1e-8,' +
+                          ' Fitting data with alpha near zero,' +
+                          ' e.g., 1e-8,' +
                           ' may cause precision problems.',
                           ConvergenceWarning)
 

From 611b41297d19af1a5ea9fe10a5348f7abdee3c89 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Wed, 24 Aug 2016 20:08:10 +0800
Subject: [PATCH 49/50] Add what's new

---
 doc/whats_new.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 2efbd87ac3b10..f04ae4cc41eca 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -263,6 +263,12 @@ Enhancements
      generating attribute ``estimators_samples_`` only when it is needed.
      By `David Staub`_.
 
+   - :class:`linear_model.ElasticNet` and :class:`linear_model.Lasso`
+     now work with ``np.float32`` input data without converting it
+     into ``np.float64``. This allows reducing memory
+     consumption.
+     (`#6913 <https://github.com/scikit-learn/scikit-learn/pull/6913>`_)
+     By `YenChen Lin`_.
 
 Bug fixes
 .........

From 00cadb663c25a992d50bf1aa7aceef75a53716a5 Mon Sep 17 00:00:00 2001
From: YenChenLin <yenchenlin1994@gmail.com>
Date: Thu, 25 Aug 2016 23:24:24 +0800
Subject: [PATCH 50/50] Fix error msg

---
 sklearn/linear_model/coordinate_descent.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
index ae3bee0ff9878..45d8b265324d9 100644
--- a/sklearn/linear_model/coordinate_descent.py
+++ b/sklearn/linear_model/coordinate_descent.py
@@ -474,8 +474,7 @@ def enet_path(X, y, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None,
             warnings.warn('Objective did not converge.' +
                           ' You might want' +
                           ' to increase the number of iterations.' +
-                          ' Fitting data with alpha near zero,' +
-                          ' e.g., 1e-8,' +
+                          ' Fitting data with very small alpha' +
                           ' may cause precision problems.',
                           ConvergenceWarning)