diff --git a/numpy/linalg/lapack_lite/f2c_c_lapack.c b/numpy/linalg/lapack_lite/f2c_c_lapack.c
index 25221ba55fdb..f52e1e1572c9 100644
--- a/numpy/linalg/lapack_lite/f2c_c_lapack.c
+++ b/numpy/linalg/lapack_lite/f2c_c_lapack.c
@@ -37,19 +37,20 @@ static integer c__3 = 3;
 static integer c__2 = 2;
 static integer c__0 = 0;
 static integer c__65 = 65;
-static real c_b894 = 1.f;
+static integer c__9 = 9;
+static integer c__6 = 6;
+static real c_b328 = 0.f;
+static real c_b1034 = 1.f;
 static integer c__12 = 12;
 static integer c__49 = 49;
-static real c_b1087 = 0.f;
-static integer c__9 = 9;
-static real c_b1136 = -1.f;
+static real c_b1276 = -1.f;
 static integer c__13 = 13;
 static integer c__15 = 15;
 static integer c__14 = 14;
 static integer c__16 = 16;
 static logical c_false = FALSE_;
 static logical c_true = TRUE_;
-static real c_b2023 = .5f;
+static real c_b2435 = .5f;
 
 /* Subroutine */ int cgebak_(char *job, char *side, integer *n, integer *ilo,
 	integer *ihi, real *scale, integer *m, complex *v, integer *ldv,
@@ -2684,176 +2685,764 @@ static real c_b2023 = .5f;
 
 } /* cgelqf_ */
 
-/* Subroutine */ int cgeqr2_(integer *m, integer *n, complex *a, integer *lda,
-	 complex *tau, complex *work, integer *info)
+/* Subroutine */ int cgelsd_(integer *m, integer *n, integer *nrhs, complex *
+	a, integer *lda, complex *b, integer *ldb, real *s, real *rcond,
+	integer *rank, complex *work, integer *lwork, real *rwork, integer *
+	iwork, integer *info)
 {
     /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3;
-    complex q__1;
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
 
     /* Local variables */
-    static integer i__, k;
-    static complex alpha;
-    extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
-	    , integer *, complex *, complex *, integer *, complex *),
-	    clarfg_(integer *, complex *, complex *, integer *, complex *),
-	    xerbla_(char *, integer *);
+    static integer ie, il, mm;
+    static real eps, anrm, bnrm;
+    static integer itau, nlvl, iascl, ibscl;
+    static real sfmin;
+    static integer minmn, maxmn, itaup, itauq, mnthr, nwork;
+    extern /* Subroutine */ int cgebrd_(integer *, integer *, complex *,
+	    integer *, real *, real *, complex *, complex *, complex *,
+	    integer *, integer *), slabad_(real *, real *);
+    extern doublereal clange_(char *, integer *, integer *, complex *,
+	    integer *, real *);
+    extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *,
+	    integer *, complex *, complex *, integer *, integer *), clalsd_(
+	    char *, integer *, integer *, integer *, real *, real *, complex *
+	    , integer *, real *, integer *, complex *, real *, integer *,
+	    integer *), clascl_(char *, integer *, integer *, real *,
+	    real *, integer *, integer *, complex *, integer *, integer *), cgeqrf_(integer *, integer *, complex *, integer *,
+	    complex *, complex *, integer *, integer *);
+    extern doublereal slamch_(char *);
+    extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex
+	    *, integer *, complex *, integer *), claset_(char *,
+	    integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+	    integer *, integer *, ftnlen, ftnlen);
+    static real bignum;
+    extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+	    real *, integer *, integer *, real *, integer *, integer *), cunmbr_(char *, char *, char *, integer *, integer *,
+	    integer *, complex *, integer *, complex *, complex *, integer *,
+	    complex *, integer *, integer *), slaset_(
+	    char *, integer *, integer *, real *, real *, real *, integer *), cunmlq_(char *, char *, integer *, integer *, integer *,
+	    complex *, integer *, complex *, complex *, integer *, complex *,
+	    integer *, integer *);
+    static integer ldwork;
+    extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *,
+	    integer *, complex *, integer *, complex *, complex *, integer *,
+	    complex *, integer *, integer *);
+    static integer liwork, minwrk, maxwrk;
+    static real smlnum;
+    static integer lrwork;
+    static logical lquery;
+    static integer nrwork, smlsiz;
 
 
 /*
-    -- LAPACK routine (version 3.2.2) --
+    -- LAPACK driver routine (version 3.2) --
     -- LAPACK is a software package provided by Univ. of Tennessee,    --
     -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-       June 2010
+       November 2006
 
 
     Purpose
     =======
 
-    CGEQR2 computes a QR factorization of a complex m by n matrix A:
-    A = Q * R.
+    CGELSD computes the minimum-norm solution to a real linear least
+    squares problem:
+        minimize 2-norm(| b - A*x |)
+    using the singular value decomposition (SVD) of A. A is an M-by-N
+    matrix which may be rank-deficient.
+
+    Several right hand side vectors b and solution vectors x can be
+    handled in a single call; they are stored as the columns of the
+    M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+    matrix X.
+
+    The problem is solved in three steps:
+    (1) Reduce the coefficient matrix A to bidiagonal form with
+        Householder tranformations, reducing the original problem
+        into a "bidiagonal least squares problem" (BLS)
+    (2) Solve the BLS using a divide and conquer approach.
+    (3) Apply back all the Householder tranformations to solve
+        the original least squares problem.
+
+    The effective rank of A is determined by treating as zero those
+    singular values which are less than RCOND times the largest singular
+    value.
+
+    The divide and conquer algorithm makes very mild assumptions about
+    floating point arithmetic. It will work on machines with a guard
+    digit in add/subtract, or on those binary machines without guard
+    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+    Cray-2. It could conceivably fail on hexadecimal or decimal machines
+    without guard digits, but we know of none.
 
     Arguments
     =========
 
     M       (input) INTEGER
-            The number of rows of the matrix A.  M >= 0.
+            The number of rows of the matrix A. M >= 0.
 
     N       (input) INTEGER
-            The number of columns of the matrix A.  N >= 0.
+            The number of columns of the matrix A. N >= 0.
+
+    NRHS    (input) INTEGER
+            The number of right hand sides, i.e., the number of columns
+            of the matrices B and X. NRHS >= 0.
 
     A       (input/output) COMPLEX array, dimension (LDA,N)
-            On entry, the m by n matrix A.
-            On exit, the elements on and above the diagonal of the array
-            contain the min(m,n) by n upper trapezoidal matrix R (R is
-            upper triangular if m >= n); the elements below the diagonal,
-            with the array TAU, represent the unitary matrix Q as a
-            product of elementary reflectors (see Further Details).
+            On entry, the M-by-N matrix A.
+            On exit, A has been destroyed.
 
     LDA     (input) INTEGER
-            The leading dimension of the array A.  LDA >= max(1,M).
+            The leading dimension of the array A. LDA >= max(1,M).
 
-    TAU     (output) COMPLEX array, dimension (min(M,N))
-            The scalar factors of the elementary reflectors (see Further
-            Details).
+    B       (input/output) COMPLEX array, dimension (LDB,NRHS)
+            On entry, the M-by-NRHS right hand side matrix B.
+            On exit, B is overwritten by the N-by-NRHS solution matrix X.
+            If m >= n and RANK = n, the residual sum-of-squares for
+            the solution in the i-th column is given by the sum of
+            squares of the modulus of elements n+1:m in that column.
 
-    WORK    (workspace) COMPLEX array, dimension (N)
+    LDB     (input) INTEGER
+            The leading dimension of the array B.  LDB >= max(1,M,N).
 
-    INFO    (output) INTEGER
-            = 0: successful exit
-            < 0: if INFO = -i, the i-th argument had an illegal value
+    S       (output) REAL array, dimension (min(M,N))
+            The singular values of A in decreasing order.
+            The condition number of A in the 2-norm = S(1)/S(min(m,n)).
 
-    Further Details
-    ===============
+    RCOND   (input) REAL
+            RCOND is used to determine the effective rank of A.
+            Singular values S(i) <= RCOND*S(1) are treated as zero.
+            If RCOND < 0, machine precision is used instead.
 
-    The matrix Q is represented as a product of elementary reflectors
+    RANK    (output) INTEGER
+            The effective rank of A, i.e., the number of singular values
+            which are greater than RCOND*S(1).
 
-       Q = H(1) H(2) . . . H(k), where k = min(m,n).
+    WORK    (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 
-    Each H(i) has the form
+    LWORK   (input) INTEGER
+            The dimension of the array WORK. LWORK must be at least 1.
+            The exact minimum amount of workspace needed depends on M,
+            N and NRHS. As long as LWORK is at least
+                2 * N + N * NRHS
+            if M is greater than or equal to N or
+                2 * M + M * NRHS
+            if M is less than N, the code will execute correctly.
+            For good performance, LWORK should generally be larger.
 
-       H(i) = I - tau * v * v'
+            If LWORK = -1, then a workspace query is assumed; the routine
+            only calculates the optimal size of the array WORK and the
+            minimum sizes of the arrays RWORK and IWORK, and returns
+            these values as the first entries of the WORK, RWORK and
+            IWORK arrays, and no error message related to LWORK is issued
+            by XERBLA.
 
-    where tau is a complex scalar, and v is a complex vector with
-    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
-    and tau in TAU(i).
+    RWORK   (workspace) REAL array, dimension (MAX(1,LRWORK))
+            LRWORK >=
+               10*N + 2*N*SMLSIZ + 8*N*NLVL + 3*SMLSIZ*NRHS +
+               MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS )
+            if M is greater than or equal to N or
+               10*M + 2*M*SMLSIZ + 8*M*NLVL + 3*SMLSIZ*NRHS +
+               MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS )
+            if M is less than N, the code will execute correctly.
+            SMLSIZ is returned by ILAENV and is equal to the maximum
+            size of the subproblems at the bottom of the computation
+            tree (usually about 25), and
+               NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 )
+            On exit, if INFO = 0, RWORK(1) returns the minimum LRWORK.
+
+    IWORK   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
+            LIWORK >= max(1, 3*MINMN*NLVL + 11*MINMN),
+            where MINMN = MIN( M,N ).
+            On exit, if INFO = 0, IWORK(1) returns the minimum LIWORK.
+
+    INFO    (output) INTEGER
+            = 0: successful exit
+            < 0: if INFO = -i, the i-th argument had an illegal value.
+            > 0:  the algorithm for computing the SVD failed to converge;
+                  if INFO = i, i off-diagonal elements of an intermediate
+                  bidiagonal form did not converge to zero.
+
+    Further Details
+    ===============
+
+    Based on contributions by
+       Ming Gu and Ren-Cang Li, Computer Science Division, University of
+         California at Berkeley, USA
+       Osni Marques, LBNL/NERSC, USA
 
     =====================================================================
 
 
-       Test the input arguments
+       Test the input arguments.
 */
 
     /* Parameter adjustments */
     a_dim1 = *lda;
     a_offset = 1 + a_dim1;
     a -= a_offset;
-    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    --s;
     --work;
+    --rwork;
+    --iwork;
 
     /* Function Body */
     *info = 0;
+    minmn = min(*m,*n);
+    maxmn = max(*m,*n);
+    lquery = *lwork == -1;
     if (*m < 0) {
 	*info = -1;
     } else if (*n < 0) {
 	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
     } else if (*lda < max(1,*m)) {
-	*info = -4;
-    }
-    if (*info != 0) {
-	i__1 = -(*info);
-	xerbla_("CGEQR2", &i__1);
-	return 0;
+	*info = -5;
+    } else if (*ldb < max(1,maxmn)) {
+	*info = -7;
     }
 
-    k = min(*m,*n);
+/*
+       Compute workspace.
+       (Note: Comments in the code beginning "Workspace:" describe the
+       minimal amount of workspace needed at that point in the code,
+       as well as the preferred amount for good performance.
+       NB refers to the optimal block size for the immediately
+       following subroutine, as returned by ILAENV.)
+*/
 
-    i__1 = k;
-    for (i__ = 1; i__ <= i__1; ++i__) {
+    if (*info == 0) {
+	minwrk = 1;
+	maxwrk = 1;
+	liwork = 1;
+	lrwork = 1;
+	if (minmn > 0) {
+	    smlsiz = ilaenv_(&c__9, "CGELSD", " ", &c__0, &c__0, &c__0, &c__0,
+		     (ftnlen)6, (ftnlen)1);
+	    mnthr = ilaenv_(&c__6, "CGELSD", " ", m, n, nrhs, &c_n1, (ftnlen)
+		    6, (ftnlen)1);
+/* Computing MAX */
+	    i__1 = (integer) (log((real) minmn / (real) (smlsiz + 1)) / log(
+		    2.f)) + 1;
+	    nlvl = max(i__1,0);
+	    liwork = minmn * 3 * nlvl + minmn * 11;
+	    mm = *m;
+	    if (*m >= *n && *m >= mnthr) {
 
-/*        Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+/*
+                Path 1a - overdetermined, with many more rows than
+                          columns.
+*/
 
-	i__2 = *m - i__ + 1;
-/* Computing MIN */
-	i__3 = i__ + 1;
-	clarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1]
-		, &c__1, &tau[i__]);
-	if (i__ < *n) {
+		mm = *n;
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = *n * ilaenv_(&c__1, "CGEQRF", " ", m, n,
+			 &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+		maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = *nrhs * ilaenv_(&c__1, "CUNMQR", "LC",
+			m, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)2);
+		maxwrk = max(i__1,i__2);
+	    }
+	    if (*m >= *n) {
 
-/*           Apply H(i)' to A(i:m,i+1:n) from the left */
+/*
+                Path 1 - overdetermined or exactly determined.
 
-	    i__2 = i__ + i__ * a_dim1;
-	    alpha.r = a[i__2].r, alpha.i = a[i__2].i;
-	    i__2 = i__ + i__ * a_dim1;
-	    a[i__2].r = 1.f, a[i__2].i = 0.f;
-	    i__2 = *m - i__ + 1;
-	    i__3 = *n - i__;
-	    r_cnjg(&q__1, &tau[i__]);
-	    clarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &q__1,
-		     &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
-	    i__2 = i__ + i__ * a_dim1;
-	    a[i__2].r = alpha.r, a[i__2].i = alpha.i;
+   Computing MAX
+   Computing 2nd power
+*/
+		i__3 = smlsiz + 1;
+		i__1 = i__3 * i__3, i__2 = *n * (*nrhs + 1) + (*nrhs << 1);
+		lrwork = *n * 10 + (*n << 1) * smlsiz + (*n << 3) * nlvl +
+			smlsiz * 3 * *nrhs + max(i__1,i__2);
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = (*n << 1) + (mm + *n) * ilaenv_(&c__1,
+			"CGEBRD", " ", &mm, n, &c_n1, &c_n1, (ftnlen)6, (
+			ftnlen)1);
+		maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = (*n << 1) + *nrhs * ilaenv_(&c__1,
+			"CUNMBR", "QLC", &mm, nrhs, n, &c_n1, (ftnlen)6, (
+			ftnlen)3);
+		maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = (*n << 1) + (*n - 1) * ilaenv_(&c__1,
+			"CUNMBR", "PLN", n, nrhs, n, &c_n1, (ftnlen)6, (
+			ftnlen)3);
+		maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = (*n << 1) + *n * *nrhs;
+		maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		i__1 = (*n << 1) + mm, i__2 = (*n << 1) + *n * *nrhs;
+		minwrk = max(i__1,i__2);
+	    }
+	    if (*n > *m) {
+/*
+   Computing MAX
+   Computing 2nd power
+*/
+		i__3 = smlsiz + 1;
+		i__1 = i__3 * i__3, i__2 = *n * (*nrhs + 1) + (*nrhs << 1);
+		lrwork = *m * 10 + (*m << 1) * smlsiz + (*m << 3) * nlvl +
+			smlsiz * 3 * *nrhs + max(i__1,i__2);
+		if (*n >= mnthr) {
+
+/*
+                   Path 2a - underdetermined, with many more columns
+                             than rows.
+*/
+
+		    maxwrk = *m + *m * ilaenv_(&c__1, "CGELQF", " ", m, n, &
+			    c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m << 1) *
+			    ilaenv_(&c__1, "CGEBRD", " ", m, m, &c_n1, &c_n1,
+			    (ftnlen)6, (ftnlen)1);
+		    maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *nrhs *
+			    ilaenv_(&c__1, "CUNMBR", "QLC", m, nrhs, m, &c_n1,
+			     (ftnlen)6, (ftnlen)3);
+		    maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m - 1) *
+			    ilaenv_(&c__1, "CUNMLQ", "LC", n, nrhs, m, &c_n1,
+			    (ftnlen)6, (ftnlen)2);
+		    maxwrk = max(i__1,i__2);
+		    if (*nrhs > 1) {
+/* Computing MAX */
+			i__1 = maxwrk, i__2 = *m * *m + *m + *m * *nrhs;
+			maxwrk = max(i__1,i__2);
+		    } else {
+/* Computing MAX */
+			i__1 = maxwrk, i__2 = *m * *m + (*m << 1);
+			maxwrk = max(i__1,i__2);
+		    }
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *m * *nrhs;
+		    maxwrk = max(i__1,i__2);
+/*
+       XXX: Ensure the Path 2a case below is triggered.  The workspace
+       calculation should use queries for all routines eventually.
+   Computing MAX
+   Computing MAX
+*/
+		    i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4),
+			    i__3 = max(i__3,*nrhs), i__4 = *n - *m * 3;
+		    i__1 = maxwrk, i__2 = (*m << 2) + *m * *m + max(i__3,i__4)
+			    ;
+		    maxwrk = max(i__1,i__2);
+		} else {
+
+/*                 Path 2 - underdetermined. */
+
+		    maxwrk = (*m << 1) + (*n + *m) * ilaenv_(&c__1, "CGEBRD",
+			    " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = (*m << 1) + *nrhs * ilaenv_(&c__1,
+			    "CUNMBR", "QLC", m, nrhs, m, &c_n1, (ftnlen)6, (
+			    ftnlen)3);
+		    maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1,
+			    "CUNMBR", "PLN", n, nrhs, m, &c_n1, (ftnlen)6, (
+			    ftnlen)3);
+		    maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = (*m << 1) + *m * *nrhs;
+		    maxwrk = max(i__1,i__2);
+		}
+/* Computing MAX */
+		i__1 = (*m << 1) + *n, i__2 = (*m << 1) + *m * *nrhs;
+		minwrk = max(i__1,i__2);
+	    }
+	}
+	minwrk = min(minwrk,maxwrk);
+	work[1].r = (real) maxwrk, work[1].i = 0.f;
+	iwork[1] = liwork;
+	rwork[1] = (real) lrwork;
+
+	if (*lwork < minwrk && ! lquery) {
+	    *info = -12;
 	}
-/* L10: */
     }
-    return 0;
 
-/*     End of CGEQR2 */
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGELSD", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
 
-} /* cgeqr2_ */
+/*     Quick return if possible. */
 
-/* Subroutine */ int cgeqrf_(integer *m, integer *n, complex *a, integer *lda,
-	 complex *tau, complex *work, integer *lwork, integer *info)
-{
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+    if (*m == 0 || *n == 0) {
+	*rank = 0;
+	return 0;
+    }
 
-    /* Local variables */
-    static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
-    extern /* Subroutine */ int cgeqr2_(integer *, integer *, complex *,
-	    integer *, complex *, complex *, integer *), clarfb_(char *, char
-	    *, char *, char *, integer *, integer *, integer *, complex *,
-	    integer *, complex *, integer *, complex *, integer *, complex *,
-	    integer *), clarft_(char *, char *
-	    , integer *, integer *, complex *, integer *, complex *, complex *
-	    , integer *), xerbla_(char *, integer *);
-    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
-	    integer *, integer *, ftnlen, ftnlen);
-    static integer ldwork, lwkopt;
-    static logical lquery;
+/*     Get machine parameters. */
 
+    eps = slamch_("P");
+    sfmin = slamch_("S");
+    smlnum = sfmin / eps;
+    bignum = 1.f / smlnum;
+    slabad_(&smlnum, &bignum);
 
-/*
-    -- LAPACK routine (version 3.2) --
-    -- LAPACK is a software package provided by Univ. of Tennessee,    --
-    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-       November 2006
+/*     Scale A if max entry outside range [SMLNUM,BIGNUM]. */
 
+    anrm = clange_("M", m, n, &a[a_offset], lda, &rwork[1]);
+    iascl = 0;
+    if (anrm > 0.f && anrm < smlnum) {
 
-    Purpose
-    =======
+/*        Scale matrix norm up to SMLNUM */
 
-    CGEQRF computes a QR factorization of a complex M-by-N matrix A:
+	clascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda,
+		info);
+	iascl = 1;
+    } else if (anrm > bignum) {
+
+/*        Scale matrix norm down to BIGNUM. */
+
+	clascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda,
+		info);
+	iascl = 2;
+    } else if (anrm == 0.f) {
+
+/*        Matrix all zero. Return zero solution. */
+
+	i__1 = max(*m,*n);
+	claset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[b_offset], ldb);
+	slaset_("F", &minmn, &c__1, &c_b328, &c_b328, &s[1], &c__1)
+		;
+	*rank = 0;
+	goto L10;
+    }
+
+/*     Scale B if max entry outside range [SMLNUM,BIGNUM]. */
+
+    bnrm = clange_("M", m, nrhs, &b[b_offset], ldb, &rwork[1]);
+    ibscl = 0;
+    if (bnrm > 0.f && bnrm < smlnum) {
+
+/*        Scale matrix norm up to SMLNUM. */
+
+	clascl_("G", &c__0, &c__0, &bnrm, &smlnum, m, nrhs, &b[b_offset], ldb,
+		 info);
+	ibscl = 1;
+    } else if (bnrm > bignum) {
+
+/*        Scale matrix norm down to BIGNUM. */
+
+	clascl_("G", &c__0, &c__0, &bnrm, &bignum, m, nrhs, &b[b_offset], ldb,
+		 info);
+	ibscl = 2;
+    }
+
+/*     If M < N make sure B(M+1:N,:) = 0 */
+
+    if (*m < *n) {
+	i__1 = *n - *m;
+	claset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     Overdetermined case. */
+
+    if (*m >= *n) {
+
+/*        Path 1 - overdetermined or exactly determined. */
+
+	mm = *m;
+	if (*m >= mnthr) {
+
+/*           Path 1a - overdetermined, with many more rows than columns */
+
+	    mm = *n;
+	    itau = 1;
+	    nwork = itau + *n;
+
+/*
+             Compute A=Q*R.
+             (RWorkspace: need N)
+             (CWorkspace: need N, prefer N*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    cgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+		     info);
+
+/*
+             Multiply B by transpose(Q).
+             (RWorkspace: need N)
+             (CWorkspace: need NRHS, prefer NRHS*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    cunmqr_("L", "C", m, nrhs, n, &a[a_offset], lda, &work[itau], &b[
+		    b_offset], ldb, &work[nwork], &i__1, info);
+
+/*           Zero out below R. */
+
+	    if (*n > 1) {
+		i__1 = *n - 1;
+		i__2 = *n - 1;
+		claset_("L", &i__1, &i__2, &c_b56, &c_b56, &a[a_dim1 + 2],
+			lda);
+	    }
+	}
+
+	itauq = 1;
+	itaup = itauq + *n;
+	nwork = itaup + *n;
+	ie = 1;
+	nrwork = ie + *n;
+
+/*
+          Bidiagonalize R in A.
+          (RWorkspace: need N)
+          (CWorkspace: need 2*N+MM, prefer 2*N+(MM+N)*NB)
+*/
+
+	i__1 = *lwork - nwork + 1;
+	cgebrd_(&mm, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq], &
+		work[itaup], &work[nwork], &i__1, info);
+
+/*
+          Multiply B by transpose of left bidiagonalizing vectors of R.
+          (CWorkspace: need 2*N+NRHS, prefer 2*N+NRHS*NB)
+*/
+
+	i__1 = *lwork - nwork + 1;
+	cunmbr_("Q", "L", "C", &mm, nrhs, n, &a[a_offset], lda, &work[itauq],
+		&b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/*        Solve the bidiagonal least squares problem. */
+
+	clalsd_("U", &smlsiz, n, nrhs, &s[1], &rwork[ie], &b[b_offset], ldb,
+		rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1], info);
+	if (*info != 0) {
+	    goto L10;
+	}
+
+/*        Multiply B by right bidiagonalizing vectors of R. */
+
+	i__1 = *lwork - nwork + 1;
+	cunmbr_("P", "L", "N", n, nrhs, n, &a[a_offset], lda, &work[itaup], &
+		b[b_offset], ldb, &work[nwork], &i__1, info);
+
+    } else /* if(complicated condition) */ {
+/* Computing MAX */
+	i__1 = *m, i__2 = (*m << 1) - 4, i__1 = max(i__1,i__2), i__1 = max(
+		i__1,*nrhs), i__2 = *n - *m * 3;
+	if (*n >= mnthr && *lwork >= (*m << 2) + *m * *m + max(i__1,i__2)) {
+
+/*
+          Path 2a - underdetermined, with many more columns than rows
+          and sufficient workspace for an efficient algorithm.
+*/
+
+	    ldwork = *m;
+/*
+   Computing MAX
+   Computing MAX
+*/
+	    i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), i__3 =
+		    max(i__3,*nrhs), i__4 = *n - *m * 3;
+	    i__1 = (*m << 2) + *m * *lda + max(i__3,i__4), i__2 = *m * *lda +
+		    *m + *m * *nrhs;
+	    if (*lwork >= max(i__1,i__2)) {
+		ldwork = *lda;
+	    }
+	    itau = 1;
+	    nwork = *m + 1;
+
+/*
+          Compute A=L*Q.
+          (CWorkspace: need 2*M, prefer M+M*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    cgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+		     info);
+	    il = nwork;
+
+/*        Copy L to WORK(IL), zeroing out above its diagonal. */
+
+	    clacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwork);
+	    i__1 = *m - 1;
+	    i__2 = *m - 1;
+	    claset_("U", &i__1, &i__2, &c_b56, &c_b56, &work[il + ldwork], &
+		    ldwork);
+	    itauq = il + ldwork * *m;
+	    itaup = itauq + *m;
+	    nwork = itaup + *m;
+	    ie = 1;
+	    nrwork = ie + *m;
+
+/*
+          Bidiagonalize L in WORK(IL).
+          (RWorkspace: need M)
+          (CWorkspace: need M*M+4*M, prefer M*M+4*M+2*M*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    cgebrd_(m, m, &work[il], &ldwork, &s[1], &rwork[ie], &work[itauq],
+		     &work[itaup], &work[nwork], &i__1, info);
+
+/*
+          Multiply B by transpose of left bidiagonalizing vectors of L.
+          (CWorkspace: need M*M+4*M+NRHS, prefer M*M+4*M+NRHS*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    cunmbr_("Q", "L", "C", m, nrhs, m, &work[il], &ldwork, &work[
+		    itauq], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/*        Solve the bidiagonal least squares problem. */
+
+	    clalsd_("U", &smlsiz, m, nrhs, &s[1], &rwork[ie], &b[b_offset],
+		    ldb, rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1],
+		     info);
+	    if (*info != 0) {
+		goto L10;
+	    }
+
+/*        Multiply B by right bidiagonalizing vectors of L. */
+
+	    i__1 = *lwork - nwork + 1;
+	    cunmbr_("P", "L", "N", m, nrhs, m, &work[il], &ldwork, &work[
+		    itaup], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/*        Zero out below first M rows of B. */
+
+	    i__1 = *n - *m;
+	    claset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[*m + 1 + b_dim1],
+		    ldb);
+	    nwork = itau + *m;
+
+/*
+          Multiply transpose(Q) by B.
+          (CWorkspace: need NRHS, prefer NRHS*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    cunmlq_("L", "C", n, nrhs, m, &a[a_offset], lda, &work[itau], &b[
+		    b_offset], ldb, &work[nwork], &i__1, info);
+
+	} else {
+
+/*        Path 2 - remaining underdetermined cases. */
+
+	    itauq = 1;
+	    itaup = itauq + *m;
+	    nwork = itaup + *m;
+	    ie = 1;
+	    nrwork = ie + *m;
+
+/*
+          Bidiagonalize A.
+          (RWorkspace: need M)
+          (CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    cgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq],
+		    &work[itaup], &work[nwork], &i__1, info);
+
+/*
+          Multiply B by transpose of left bidiagonalizing vectors.
+          (CWorkspace: need 2*M+NRHS, prefer 2*M+NRHS*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    cunmbr_("Q", "L", "C", m, nrhs, n, &a[a_offset], lda, &work[itauq]
+		    , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/*        Solve the bidiagonal least squares problem. */
+
+	    clalsd_("L", &smlsiz, m, nrhs, &s[1], &rwork[ie], &b[b_offset],
+		    ldb, rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1],
+		     info);
+	    if (*info != 0) {
+		goto L10;
+	    }
+
+/*        Multiply B by right bidiagonalizing vectors of A. */
+
+	    i__1 = *lwork - nwork + 1;
+	    cunmbr_("P", "L", "N", n, nrhs, m, &a[a_offset], lda, &work[itaup]
+		    , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+	}
+    }
+
+/*     Undo scaling. */
+
+    if (iascl == 1) {
+	clascl_("G", &c__0, &c__0, &anrm, &smlnum, n, nrhs, &b[b_offset], ldb,
+		 info);
+	slascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+		minmn, info);
+    } else if (iascl == 2) {
+	clascl_("G", &c__0, &c__0, &anrm, &bignum, n, nrhs, &b[b_offset], ldb,
+		 info);
+	slascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+		minmn, info);
+    }
+    if (ibscl == 1) {
+	clascl_("G", &c__0, &c__0, &smlnum, &bnrm, n, nrhs, &b[b_offset], ldb,
+		 info);
+    } else if (ibscl == 2) {
+	clascl_("G", &c__0, &c__0, &bignum, &bnrm, n, nrhs, &b[b_offset], ldb,
+		 info);
+    }
+
+L10:
+    work[1].r = (real) maxwrk, work[1].i = 0.f;
+    iwork[1] = liwork;
+    rwork[1] = (real) lrwork;
+    return 0;
+
+/*     End of CGELSD */
+
+} /* cgelsd_ */
+
+/* Subroutine */ int cgeqr2_(integer *m, integer *n, complex *a, integer *lda,
+	 complex *tau, complex *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+    complex q__1;
+
+    /* Local variables */
+    static integer i__, k;
+    static complex alpha;
+    extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+	    , integer *, complex *, complex *, integer *, complex *),
+	    clarfg_(integer *, complex *, complex *, integer *, complex *),
+	    xerbla_(char *, integer *);
+
+
+/*
+    -- LAPACK routine (version 3.2.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       June 2010
+
+
+    Purpose
+    =======
+
+    CGEQR2 computes a QR factorization of a complex m by n matrix A:
     A = Q * R.
 
     Arguments
@@ -2866,13 +3455,12 @@ static real c_b2023 = .5f;
             The number of columns of the matrix A.  N >= 0.
 
     A       (input/output) COMPLEX array, dimension (LDA,N)
-            On entry, the M-by-N matrix A.
+            On entry, the m by n matrix A.
             On exit, the elements on and above the diagonal of the array
-            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
+            contain the min(m,n) by n upper trapezoidal matrix R (R is
             upper triangular if m >= n); the elements below the diagonal,
             with the array TAU, represent the unitary matrix Q as a
-            product of min(m,n) elementary reflectors (see Further
-            Details).
+            product of elementary reflectors (see Further Details).
 
     LDA     (input) INTEGER
             The leading dimension of the array A.  LDA >= max(1,M).
@@ -2881,22 +3469,11 @@ static real c_b2023 = .5f;
             The scalar factors of the elementary reflectors (see Further
             Details).
 
-    WORK    (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
-            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
-
-    LWORK   (input) INTEGER
-            The dimension of the array WORK.  LWORK >= max(1,N).
-            For optimum performance LWORK >= N*NB, where NB is
-            the optimal blocksize.
-
-            If LWORK = -1, then a workspace query is assumed; the routine
-            only calculates the optimal size of the WORK array, returns
-            this value as the first entry of the WORK array, and no error
-            message related to LWORK is issued by XERBLA.
+    WORK    (workspace) COMPLEX array, dimension (N)
 
     INFO    (output) INTEGER
-            = 0:  successful exit
-            < 0:  if INFO = -i, the i-th argument had an illegal value
+            = 0: successful exit
+            < 0: if INFO = -i, the i-th argument had an illegal value
 
     Further Details
     ===============
@@ -2928,63 +3505,218 @@ static real c_b2023 = .5f;
 
     /* Function Body */
     *info = 0;
-    nb = ilaenv_(&c__1, "CGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
-	    1);
-    lwkopt = *n * nb;
-    work[1].r = (real) lwkopt, work[1].i = 0.f;
-    lquery = *lwork == -1;
     if (*m < 0) {
 	*info = -1;
     } else if (*n < 0) {
 	*info = -2;
     } else if (*lda < max(1,*m)) {
 	*info = -4;
-    } else if (*lwork < max(1,*n) && ! lquery) {
-	*info = -7;
     }
     if (*info != 0) {
 	i__1 = -(*info);
-	xerbla_("CGEQRF", &i__1);
-	return 0;
-    } else if (lquery) {
+	xerbla_("CGEQR2", &i__1);
 	return 0;
     }
 
-/*     Quick return if possible */
-
     k = min(*m,*n);
-    if (k == 0) {
-	work[1].r = 1.f, work[1].i = 0.f;
-	return 0;
-    }
-
-    nbmin = 2;
-    nx = 0;
-    iws = *n;
-    if (nb > 1 && nb < k) {
 
-/*
-          Determine when to cross over from blocked to unblocked code.
-
-   Computing MAX
-*/
-	i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQRF", " ", m, n, &c_n1, &c_n1, (
-		ftnlen)6, (ftnlen)1);
-	nx = max(i__1,i__2);
-	if (nx < k) {
+    i__1 = k;
+    for (i__ = 1; i__ <= i__1; ++i__) {
 
-/*           Determine if workspace is large enough for blocked code. */
+/*        Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
 
-	    ldwork = *n;
-	    iws = ldwork * nb;
-	    if (*lwork < iws) {
+	i__2 = *m - i__ + 1;
+/* Computing MIN */
+	i__3 = i__ + 1;
+	clarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1]
+		, &c__1, &tau[i__]);
+	if (i__ < *n) {
 
-/*
-                Not enough workspace to use optimal NB:  reduce NB and
-                determine the minimum value of NB.
-*/
+/*           Apply H(i)' to A(i:m,i+1:n) from the left */
 
-		nb = *lwork / ldwork;
+	    i__2 = i__ + i__ * a_dim1;
+	    alpha.r = a[i__2].r, alpha.i = a[i__2].i;
+	    i__2 = i__ + i__ * a_dim1;
+	    a[i__2].r = 1.f, a[i__2].i = 0.f;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n - i__;
+	    r_cnjg(&q__1, &tau[i__]);
+	    clarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &q__1,
+		     &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+	    i__2 = i__ + i__ * a_dim1;
+	    a[i__2].r = alpha.r, a[i__2].i = alpha.i;
+	}
+/* L10: */
+    }
+    return 0;
+
+/*     End of CGEQR2 */
+
+} /* cgeqr2_ */
+
+/* Subroutine */ int cgeqrf_(integer *m, integer *n, complex *a, integer *lda,
+	 complex *tau, complex *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+    extern /* Subroutine */ int cgeqr2_(integer *, integer *, complex *,
+	    integer *, complex *, complex *, integer *), clarfb_(char *, char
+	    *, char *, char *, integer *, integer *, integer *, complex *,
+	    integer *, complex *, integer *, complex *, integer *, complex *,
+	    integer *), clarft_(char *, char *
+	    , integer *, integer *, complex *, integer *, complex *, complex *
+	    , integer *), xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+	    integer *, integer *, ftnlen, ftnlen);
+    static integer ldwork, lwkopt;
+    static logical lquery;
+
+
+/*
+    -- LAPACK routine (version 3.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       November 2006
+
+
+    Purpose
+    =======
+
+    CGEQRF computes a QR factorization of a complex M-by-N matrix A:
+    A = Q * R.
+
+    Arguments
+    =========
+
+    M       (input) INTEGER
+            The number of rows of the matrix A.  M >= 0.
+
+    N       (input) INTEGER
+            The number of columns of the matrix A.  N >= 0.
+
+    A       (input/output) COMPLEX array, dimension (LDA,N)
+            On entry, the M-by-N matrix A.
+            On exit, the elements on and above the diagonal of the array
+            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
+            upper triangular if m >= n); the elements below the diagonal,
+            with the array TAU, represent the unitary matrix Q as a
+            product of min(m,n) elementary reflectors (see Further
+            Details).
+
+    LDA     (input) INTEGER
+            The leading dimension of the array A.  LDA >= max(1,M).
+
+    TAU     (output) COMPLEX array, dimension (min(M,N))
+            The scalar factors of the elementary reflectors (see Further
+            Details).
+
+    WORK    (workspace/output) COMPLEX array, dimension (MAX(1,LWORK))
+            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+    LWORK   (input) INTEGER
+            The dimension of the array WORK.  LWORK >= max(1,N).
+            For optimum performance LWORK >= N*NB, where NB is
+            the optimal blocksize.
+
+            If LWORK = -1, then a workspace query is assumed; the routine
+            only calculates the optimal size of the WORK array, returns
+            this value as the first entry of the WORK array, and no error
+            message related to LWORK is issued by XERBLA.
+
+    INFO    (output) INTEGER
+            = 0:  successful exit
+            < 0:  if INFO = -i, the i-th argument had an illegal value
+
+    Further Details
+    ===============
+
+    The matrix Q is represented as a product of elementary reflectors
+
+       Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+    Each H(i) has the form
+
+       H(i) = I - tau * v * v'
+
+    where tau is a complex scalar, and v is a complex vector with
+    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+    and tau in TAU(i).
+
+    =====================================================================
+
+
+       Test the input arguments
+*/
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    nb = ilaenv_(&c__1, "CGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+	    1);
+    lwkopt = *n * nb;
+    work[1].r = (real) lwkopt, work[1].i = 0.f;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*lda < max(1,*m)) {
+	*info = -4;
+    } else if (*lwork < max(1,*n) && ! lquery) {
+	*info = -7;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGEQRF", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    k = min(*m,*n);
+    if (k == 0) {
+	work[1].r = 1.f, work[1].i = 0.f;
+	return 0;
+    }
+
+    nbmin = 2;
+    nx = 0;
+    iws = *n;
+    if (nb > 1 && nb < k) {
+
+/*
+          Determine when to cross over from blocked to unblocked code.
+
+   Computing MAX
+*/
+	i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQRF", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)6, (ftnlen)1);
+	nx = max(i__1,i__2);
+	if (nx < k) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    ldwork = *n;
+	    iws = ldwork * nb;
+	    if (*lwork < iws) {
+
+/*
+                Not enough workspace to use optimal NB:  reduce NB and
+                determine the minimum value of NB.
+*/
+
+		nb = *lwork / ldwork;
 /* Computing MAX */
 		i__1 = 2, i__2 = ilaenv_(&c__2, "CGEQRF", " ", m, n, &c_n1, &
 			c_n1, (ftnlen)6, (ftnlen)1);
@@ -6471,8 +7203,8 @@ static real c_b2023 = .5f;
 	sigma = rmax / anrm;
     }
     if (iscale == 1) {
-	clascl_(uplo, &c__0, &c__0, &c_b894, &sigma, n, n, &a[a_offset], lda,
-		info);
+	clascl_(uplo, &c__0, &c__0, &c_b1034, &sigma, n, n, &a[a_offset], lda,
+		 info);
     }
 
 /*     Call CHETRD to reduce Hermitian matrix to tridiagonal form. */
@@ -7135,7 +7867,7 @@ static real c_b2023 = .5f;
 	    i__3 = i__ - 1;
 	    q__1.r = -1.f, q__1.i = -0.f;
 	    cher2k_(uplo, "No transpose", &i__3, &nb, &q__1, &a[i__ * a_dim1
-		    + 1], lda, &work[1], &ldwork, &c_b894, &a[a_offset], lda);
+		    + 1], lda, &work[1], &ldwork, &c_b1034, &a[a_offset], lda);
 
 /*
              Copy superdiagonal elements back into A, and diagonal
@@ -7184,7 +7916,7 @@ static real c_b2023 = .5f;
 	    i__3 = *n - i__ - nb + 1;
 	    q__1.r = -1.f, q__1.i = -0.f;
 	    cher2k_(uplo, "No transpose", &i__3, &nb, &q__1, &a[i__ + nb +
-		    i__ * a_dim1], lda, &work[nb + 1], &ldwork, &c_b894, &a[
+		    i__ * a_dim1], lda, &work[nb + 1], &ldwork, &c_b1034, &a[
 		    i__ + nb + (i__ + nb) * a_dim1], lda);
 
 /*
@@ -8537,8 +9269,8 @@ static real c_b2023 = .5f;
     }
 
     l = *m * *n + 1;
-    sgemm_("N", "N", m, n, n, &c_b894, &rwork[1], m, &b[b_offset], ldb, &
-	    c_b1087, &rwork[l], m);
+    sgemm_("N", "N", m, n, n, &c_b1034, &rwork[1], m, &b[b_offset], ldb, &
+	    c_b328, &rwork[l], m);
     i__1 = *n;
     for (j = 1; j <= i__1; ++j) {
 	i__2 = *m;
@@ -8560,8 +9292,8 @@ static real c_b2023 = .5f;
 	}
 /* L60: */
     }
-    sgemm_("N", "N", m, n, n, &c_b894, &rwork[1], m, &b[b_offset], ldb, &
-	    c_b1087, &rwork[l], m);
+    sgemm_("N", "N", m, n, n, &c_b1034, &rwork[1], m, &b[b_offset], ldb, &
+	    c_b328, &rwork[l], m);
     i__1 = *n;
     for (j = 1; j <= i__1; ++j) {
 	i__2 = *m;
@@ -9495,7 +10227,7 @@ static real c_b2023 = .5f;
     n1p1 = n1 + 1;
 
     if (*rho < 0.f) {
-	sscal_(&n2, &c_b1136, &z__[n1p1], &c__1);
+	sscal_(&n2, &c_b1276, &z__[n1p1], &c__1);
     }
 
 /*     Normalize z so that norm(z) = 1 */
@@ -10746,6 +11478,1932 @@ static real c_b2023 = .5f;
 
 } /* clahr2_ */
 
+/* Subroutine */ int clals0_(integer *icompq, integer *nl, integer *nr,
+	integer *sqre, integer *nrhs, complex *b, integer *ldb, complex *bx,
+	integer *ldbx, integer *perm, integer *givptr, integer *givcol,
+	integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real *
+	difl, real *difr, real *z__, integer *k, real *c__, real *s, real *
+	rwork, integer *info)
+{
+    /* System generated locals */
+    integer givcol_dim1, givcol_offset, difr_dim1, difr_offset, givnum_dim1,
+	    givnum_offset, poles_dim1, poles_offset, b_dim1, b_offset,
+	    bx_dim1, bx_offset, i__1, i__2, i__3, i__4, i__5;
+    real r__1;
+    complex q__1;
+
+    /* Local variables */
+    static integer i__, j, m, n;
+    static real dj;
+    static integer nlp1, jcol;
+    static real temp;
+    static integer jrow;
+    extern doublereal snrm2_(integer *, real *, integer *);
+    static real diflj, difrj, dsigj;
+    extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+	    complex *, integer *), sgemv_(char *, integer *, integer *, real *
+	    , real *, integer *, real *, integer *, real *, real *, integer *), csrot_(integer *, complex *, integer *, complex *,
+	    integer *, real *, real *);
+    extern doublereal slamc3_(real *, real *);
+    extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *,
+	    real *, integer *, integer *, complex *, integer *, integer *), csscal_(integer *, real *, complex *, integer *),
+	    clacpy_(char *, integer *, integer *, complex *, integer *,
+	    complex *, integer *), xerbla_(char *, integer *);
+    static real dsigjp;
+
+
+/*
+    -- LAPACK routine (version 3.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       November 2006
+
+
+    Purpose
+    =======
+
+    CLALS0 applies back the multiplying factors of either the left or the
+    right singular vector matrix of a diagonal matrix appended by a row
+    to the right hand side matrix B in solving the least squares problem
+    using the divide-and-conquer SVD approach.
+
+    For the left singular vector matrix, three types of orthogonal
+    matrices are involved:
+
+    (1L) Givens rotations: the number of such rotations is GIVPTR; the
+         pairs of columns/rows they were applied to are stored in GIVCOL;
+         and the C- and S-values of these rotations are stored in GIVNUM.
+
+    (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+         row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+         J-th row.
+
+    (3L) The left singular vector matrix of the remaining matrix.
+
+    For the right singular vector matrix, four types of orthogonal
+    matrices are involved:
+
+    (1R) The right singular vector matrix of the remaining matrix.
+
+    (2R) If SQRE = 1, one extra Givens rotation to generate the right
+         null space.
+
+    (3R) The inverse transformation of (2L).
+
+    (4R) The inverse transformation of (1L).
+
+    Arguments
+    =========
+
+    ICOMPQ (input) INTEGER
+           Specifies whether singular vectors are to be computed in
+           factored form:
+           = 0: Left singular vector matrix.
+           = 1: Right singular vector matrix.
+
+    NL     (input) INTEGER
+           The row dimension of the upper block. NL >= 1.
+
+    NR     (input) INTEGER
+           The row dimension of the lower block. NR >= 1.
+
+    SQRE   (input) INTEGER
+           = 0: the lower block is an NR-by-NR square matrix.
+           = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+           The bidiagonal matrix has row dimension N = NL + NR + 1,
+           and column dimension M = N + SQRE.
+
+    NRHS   (input) INTEGER
+           The number of columns of B and BX. NRHS must be at least 1.
+
+    B      (input/output) COMPLEX array, dimension ( LDB, NRHS )
+           On input, B contains the right hand sides of the least
+           squares problem in rows 1 through M. On output, B contains
+           the solution X in rows 1 through N.
+
+    LDB    (input) INTEGER
+           The leading dimension of B. LDB must be at least
+           max(1,MAX( M, N ) ).
+
+    BX     (workspace) COMPLEX array, dimension ( LDBX, NRHS )
+
+    LDBX   (input) INTEGER
+           The leading dimension of BX.
+
+    PERM   (input) INTEGER array, dimension ( N )
+           The permutations (from deflation and sorting) applied
+           to the two blocks.
+
+    GIVPTR (input) INTEGER
+           The number of Givens rotations which took place in this
+           subproblem.
+
+    GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 )
+           Each pair of numbers indicates a pair of rows/columns
+           involved in a Givens rotation.
+
+    LDGCOL (input) INTEGER
+           The leading dimension of GIVCOL, must be at least N.
+
+    GIVNUM (input) REAL array, dimension ( LDGNUM, 2 )
+           Each number indicates the C or S value used in the
+           corresponding Givens rotation.
+
+    LDGNUM (input) INTEGER
+           The leading dimension of arrays DIFR, POLES and
+           GIVNUM, must be at least K.
+
+    POLES  (input) REAL array, dimension ( LDGNUM, 2 )
+           On entry, POLES(1:K, 1) contains the new singular
+           values obtained from solving the secular equation, and
+           POLES(1:K, 2) is an array containing the poles in the secular
+           equation.
+
+    DIFL   (input) REAL array, dimension ( K ).
+           On entry, DIFL(I) is the distance between I-th updated
+           (undeflated) singular value and the I-th (undeflated) old
+           singular value.
+
+    DIFR   (input) REAL array, dimension ( LDGNUM, 2 ).
+           On entry, DIFR(I, 1) contains the distances between I-th
+           updated (undeflated) singular value and the I+1-th
+           (undeflated) old singular value. And DIFR(I, 2) is the
+           normalizing factor for the I-th right singular vector.
+
+    Z      (input) REAL array, dimension ( K )
+           Contain the components of the deflation-adjusted updating row
+           vector.
+
+    K      (input) INTEGER
+           Contains the dimension of the non-deflated matrix,
+           This is the order of the related secular equation. 1 <= K <=N.
+
+    C      (input) REAL
+           C contains garbage if SQRE =0 and the C-value of a Givens
+           rotation related to the right null space if SQRE = 1.
+
+    S      (input) REAL
+           S contains garbage if SQRE =0 and the S-value of a Givens
+           rotation related to the right null space if SQRE = 1.
+
+    RWORK  (workspace) REAL array, dimension
+           ( K*(1+NRHS) + 2*NRHS )
+
+    INFO   (output) INTEGER
+            = 0:  successful exit.
+            < 0:  if INFO = -i, the i-th argument had an illegal value.
+
+    Further Details
+    ===============
+
+    Based on contributions by
+       Ming Gu and Ren-Cang Li, Computer Science Division, University of
+         California at Berkeley, USA
+       Osni Marques, LBNL/NERSC, USA
+
+    =====================================================================
+
+
+       Test the input parameters.
+*/
+
+    /* Parameter adjustments */
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    bx_dim1 = *ldbx;
+    bx_offset = 1 + bx_dim1;
+    bx -= bx_offset;
+    --perm;
+    givcol_dim1 = *ldgcol;
+    givcol_offset = 1 + givcol_dim1;
+    givcol -= givcol_offset;
+    difr_dim1 = *ldgnum;
+    difr_offset = 1 + difr_dim1;
+    difr -= difr_offset;
+    poles_dim1 = *ldgnum;
+    poles_offset = 1 + poles_dim1;
+    poles -= poles_offset;
+    givnum_dim1 = *ldgnum;
+    givnum_offset = 1 + givnum_dim1;
+    givnum -= givnum_offset;
+    --difl;
+    --z__;
+    --rwork;
+
+    /* Function Body */
+    *info = 0;
+
+    if (*icompq < 0 || *icompq > 1) {
+	*info = -1;
+    } else if (*nl < 1) {
+	*info = -2;
+    } else if (*nr < 1) {
+	*info = -3;
+    } else if (*sqre < 0 || *sqre > 1) {
+	*info = -4;
+    }
+
+    n = *nl + *nr + 1;
+
+    if (*nrhs < 1) {
+	*info = -5;
+    } else if (*ldb < n) {
+	*info = -7;
+    } else if (*ldbx < n) {
+	*info = -9;
+    } else if (*givptr < 0) {
+	*info = -11;
+    } else if (*ldgcol < n) {
+	*info = -13;
+    } else if (*ldgnum < n) {
+	*info = -15;
+    } else if (*k < 1) {
+	*info = -20;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CLALS0", &i__1);
+	return 0;
+    }
+
+    m = n + *sqre;
+    nlp1 = *nl + 1;
+
+    if (*icompq == 0) {
+
+/*
+          Apply back orthogonal transformations from the left.
+
+          Step (1L): apply back the Givens rotations performed.
+*/
+
+	i__1 = *givptr;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    csrot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+		    b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+		    (givnum_dim1 << 1)], &givnum[i__ + givnum_dim1]);
+/* L10: */
+	}
+
+/*        Step (2L): permute rows of B. */
+
+	ccopy_(nrhs, &b[nlp1 + b_dim1], ldb, &bx[bx_dim1 + 1], ldbx);
+	i__1 = n;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    ccopy_(nrhs, &b[perm[i__] + b_dim1], ldb, &bx[i__ + bx_dim1],
+		    ldbx);
+/* L20: */
+	}
+
+/*
+          Step (3L): apply the inverse of the left singular vector
+          matrix to BX.
+*/
+
+	if (*k == 1) {
+	    ccopy_(nrhs, &bx[bx_offset], ldbx, &b[b_offset], ldb);
+	    if (z__[1] < 0.f) {
+		csscal_(nrhs, &c_b1276, &b[b_offset], ldb);
+	    }
+	} else {
+	    i__1 = *k;
+	    for (j = 1; j <= i__1; ++j) {
+		diflj = difl[j];
+		dj = poles[j + poles_dim1];
+		dsigj = -poles[j + (poles_dim1 << 1)];
+		if (j < *k) {
+		    difrj = -difr[j + difr_dim1];
+		    dsigjp = -poles[j + 1 + (poles_dim1 << 1)];
+		}
+		if (z__[j] == 0.f || poles[j + (poles_dim1 << 1)] == 0.f) {
+		    rwork[j] = 0.f;
+		} else {
+		    rwork[j] = -poles[j + (poles_dim1 << 1)] * z__[j] / diflj
+			    / (poles[j + (poles_dim1 << 1)] + dj);
+		}
+		i__2 = j - 1;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] ==
+			    0.f) {
+			rwork[i__] = 0.f;
+		    } else {
+			rwork[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+				 / (slamc3_(&poles[i__ + (poles_dim1 << 1)], &
+				dsigj) - diflj) / (poles[i__ + (poles_dim1 <<
+				1)] + dj);
+		    }
+/* L30: */
+		}
+		i__2 = *k;
+		for (i__ = j + 1; i__ <= i__2; ++i__) {
+		    if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] ==
+			    0.f) {
+			rwork[i__] = 0.f;
+		    } else {
+			rwork[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+				 / (slamc3_(&poles[i__ + (poles_dim1 << 1)], &
+				dsigjp) + difrj) / (poles[i__ + (poles_dim1 <<
+				 1)] + dj);
+		    }
+/* L40: */
+		}
+		rwork[1] = -1.f;
+		temp = snrm2_(k, &rwork[1], &c__1);
+
+/*
+                Since B and BX are complex, the following call to SGEMV
+                is performed in two steps (real and imaginary parts).
+
+                CALL SGEMV( 'T', K, NRHS, ONE, BX, LDBX, WORK, 1, ZERO,
+      $                     B( J, 1 ), LDB )
+*/
+
+		i__ = *k + (*nrhs << 1);
+		i__2 = *nrhs;
+		for (jcol = 1; jcol <= i__2; ++jcol) {
+		    i__3 = *k;
+		    for (jrow = 1; jrow <= i__3; ++jrow) {
+			++i__;
+			i__4 = jrow + jcol * bx_dim1;
+			rwork[i__] = bx[i__4].r;
+/* L50: */
+		    }
+/* L60: */
+		}
+		sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+			k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1], &c__1);
+		i__ = *k + (*nrhs << 1);
+		i__2 = *nrhs;
+		for (jcol = 1; jcol <= i__2; ++jcol) {
+		    i__3 = *k;
+		    for (jrow = 1; jrow <= i__3; ++jrow) {
+			++i__;
+			rwork[i__] = r_imag(&bx[jrow + jcol * bx_dim1]);
+/* L70: */
+		    }
+/* L80: */
+		}
+		sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+			k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1 + *nrhs],
+			&c__1);
+		i__2 = *nrhs;
+		for (jcol = 1; jcol <= i__2; ++jcol) {
+		    i__3 = j + jcol * b_dim1;
+		    i__4 = jcol + *k;
+		    i__5 = jcol + *k + *nrhs;
+		    q__1.r = rwork[i__4], q__1.i = rwork[i__5];
+		    b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L90: */
+		}
+		clascl_("G", &c__0, &c__0, &temp, &c_b1034, &c__1, nrhs, &b[j
+			+ b_dim1], ldb, info);
+/* L100: */
+	    }
+	}
+
+/*        Move the deflated rows of BX to B also. */
+
+	if (*k < max(m,n)) {
+	    i__1 = n - *k;
+	    clacpy_("A", &i__1, nrhs, &bx[*k + 1 + bx_dim1], ldbx, &b[*k + 1
+		    + b_dim1], ldb);
+	}
+    } else {
+
+/*
+          Apply back the right orthogonal transformations.
+
+          Step (1R): apply back the new right singular vector matrix
+          to B.
+*/
+
+	if (*k == 1) {
+	    ccopy_(nrhs, &b[b_offset], ldb, &bx[bx_offset], ldbx);
+	} else {
+	    i__1 = *k;
+	    for (j = 1; j <= i__1; ++j) {
+		dsigj = poles[j + (poles_dim1 << 1)];
+		if (z__[j] == 0.f) {
+		    rwork[j] = 0.f;
+		} else {
+		    rwork[j] = -z__[j] / difl[j] / (dsigj + poles[j +
+			    poles_dim1]) / difr[j + (difr_dim1 << 1)];
+		}
+		i__2 = j - 1;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    if (z__[j] == 0.f) {
+			rwork[i__] = 0.f;
+		    } else {
+			r__1 = -poles[i__ + 1 + (poles_dim1 << 1)];
+			rwork[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difr[
+				i__ + difr_dim1]) / (dsigj + poles[i__ +
+				poles_dim1]) / difr[i__ + (difr_dim1 << 1)];
+		    }
+/* L110: */
+		}
+		i__2 = *k;
+		for (i__ = j + 1; i__ <= i__2; ++i__) {
+		    if (z__[j] == 0.f) {
+			rwork[i__] = 0.f;
+		    } else {
+			r__1 = -poles[i__ + (poles_dim1 << 1)];
+			rwork[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difl[
+				i__]) / (dsigj + poles[i__ + poles_dim1]) /
+				difr[i__ + (difr_dim1 << 1)];
+		    }
+/* L120: */
+		}
+
+/*
+                Since B and BX are complex, the following call to SGEMV
+                is performed in two steps (real and imaginary parts).
+
+                CALL SGEMV( 'T', K, NRHS, ONE, B, LDB, WORK, 1, ZERO,
+      $                     BX( J, 1 ), LDBX )
+*/
+
+		i__ = *k + (*nrhs << 1);
+		i__2 = *nrhs;
+		for (jcol = 1; jcol <= i__2; ++jcol) {
+		    i__3 = *k;
+		    for (jrow = 1; jrow <= i__3; ++jrow) {
+			++i__;
+			i__4 = jrow + jcol * b_dim1;
+			rwork[i__] = b[i__4].r;
+/* L130: */
+		    }
+/* L140: */
+		}
+		sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+			k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1], &c__1);
+		i__ = *k + (*nrhs << 1);
+		i__2 = *nrhs;
+		for (jcol = 1; jcol <= i__2; ++jcol) {
+		    i__3 = *k;
+		    for (jrow = 1; jrow <= i__3; ++jrow) {
+			++i__;
+			rwork[i__] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L150: */
+		    }
+/* L160: */
+		}
+		sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)],
+			k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1 + *nrhs],
+			&c__1);
+		i__2 = *nrhs;
+		for (jcol = 1; jcol <= i__2; ++jcol) {
+		    i__3 = j + jcol * bx_dim1;
+		    i__4 = jcol + *k;
+		    i__5 = jcol + *k + *nrhs;
+		    q__1.r = rwork[i__4], q__1.i = rwork[i__5];
+		    bx[i__3].r = q__1.r, bx[i__3].i = q__1.i;
+/* L170: */
+		}
+/* L180: */
+	    }
+	}
+
+/*
+          Step (2R): if SQRE = 1, apply back the rotation that is
+          related to the right null space of the subproblem.
+*/
+
+	if (*sqre == 1) {
+	    ccopy_(nrhs, &b[m + b_dim1], ldb, &bx[m + bx_dim1], ldbx);
+	    csrot_(nrhs, &bx[bx_dim1 + 1], ldbx, &bx[m + bx_dim1], ldbx, c__,
+		    s);
+	}
+	if (*k < max(m,n)) {
+	    i__1 = n - *k;
+	    clacpy_("A", &i__1, nrhs, &b[*k + 1 + b_dim1], ldb, &bx[*k + 1 +
+		    bx_dim1], ldbx);
+	}
+
+/*        Step (3R): permute rows of B. */
+
+	ccopy_(nrhs, &bx[bx_dim1 + 1], ldbx, &b[nlp1 + b_dim1], ldb);
+	if (*sqre == 1) {
+	    ccopy_(nrhs, &bx[m + bx_dim1], ldbx, &b[m + b_dim1], ldb);
+	}
+	i__1 = n;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    ccopy_(nrhs, &bx[i__ + bx_dim1], ldbx, &b[perm[i__] + b_dim1],
+		    ldb);
+/* L190: */
+	}
+
+/*        Step (4R): apply back the Givens rotations performed. */
+
+	for (i__ = *givptr; i__ >= 1; --i__) {
+	    r__1 = -givnum[i__ + givnum_dim1];
+	    csrot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+		    b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+		    (givnum_dim1 << 1)], &r__1);
+/* L200: */
+	}
+    }
+
+    return 0;
+
+/*     End of CLALS0 */
+
+} /* clals0_ */
+
+/* Subroutine */ int clalsa_(integer *icompq, integer *smlsiz, integer *n,
+	integer *nrhs, complex *b, integer *ldb, complex *bx, integer *ldbx,
+	real *u, integer *ldu, real *vt, integer *k, real *difl, real *difr,
+	real *z__, real *poles, integer *givptr, integer *givcol, integer *
+	ldgcol, integer *perm, real *givnum, real *c__, real *s, real *rwork,
+	integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, difl_dim1,
+	    difl_offset, difr_dim1, difr_offset, givnum_dim1, givnum_offset,
+	    poles_dim1, poles_offset, u_dim1, u_offset, vt_dim1, vt_offset,
+	    z_dim1, z_offset, b_dim1, b_offset, bx_dim1, bx_offset, i__1,
+	    i__2, i__3, i__4, i__5, i__6;
+    complex q__1;
+
+    /* Local variables */
+    static integer i__, j, i1, ic, lf, nd, ll, nl, nr, im1, nlf, nrf, lvl,
+	    ndb1, nlp1, lvl2, nrp1, jcol, nlvl, sqre, jrow, jimag, jreal,
+	    inode, ndiml;
+    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+	    integer *, real *, real *, integer *, real *, integer *, real *,
+	    real *, integer *);
+    static integer ndimr;
+    extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+	    complex *, integer *), clals0_(integer *, integer *, integer *,
+	    integer *, integer *, complex *, integer *, complex *, integer *,
+	    integer *, integer *, integer *, integer *, real *, integer *,
+	    real *, real *, real *, real *, integer *, real *, real *, real *,
+	     integer *), xerbla_(char *, integer *), slasdt_(integer *
+	    , integer *, integer *, integer *, integer *, integer *, integer *
+	    );
+
+
+/*
+    -- LAPACK routine (version 3.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       November 2006
+
+
+    Purpose
+    =======
+
+    CLALSA is an itermediate step in solving the least squares problem
+    by computing the SVD of the coefficient matrix in compact form (The
+    singular vectors are computed as products of simple orthorgonal
+    matrices.).
+
+    If ICOMPQ = 0, CLALSA applies the inverse of the left singular vector
+    matrix of an upper bidiagonal matrix to the right hand side; and if
+    ICOMPQ = 1, CLALSA applies the right singular vector matrix to the
+    right hand side. The singular vector matrices were generated in
+    compact form by CLALSA.
+
+    Arguments
+    =========
+
+    ICOMPQ (input) INTEGER
+           Specifies whether the left or the right singular vector
+           matrix is involved.
+           = 0: Left singular vector matrix
+           = 1: Right singular vector matrix
+
+    SMLSIZ (input) INTEGER
+           The maximum size of the subproblems at the bottom of the
+           computation tree.
+
+    N      (input) INTEGER
+           The row and column dimensions of the upper bidiagonal matrix.
+
+    NRHS   (input) INTEGER
+           The number of columns of B and BX. NRHS must be at least 1.
+
+    B      (input/output) COMPLEX array, dimension ( LDB, NRHS )
+           On input, B contains the right hand sides of the least
+           squares problem in rows 1 through M.
+           On output, B contains the solution X in rows 1 through N.
+
+    LDB    (input) INTEGER
+           The leading dimension of B in the calling subprogram.
+           LDB must be at least max(1,MAX( M, N ) ).
+
+    BX     (output) COMPLEX array, dimension ( LDBX, NRHS )
+           On exit, the result of applying the left or right singular
+           vector matrix to B.
+
+    LDBX   (input) INTEGER
+           The leading dimension of BX.
+
+    U      (input) REAL array, dimension ( LDU, SMLSIZ ).
+           On entry, U contains the left singular vector matrices of all
+           subproblems at the bottom level.
+
+    LDU    (input) INTEGER, LDU = > N.
+           The leading dimension of arrays U, VT, DIFL, DIFR,
+           POLES, GIVNUM, and Z.
+
+    VT     (input) REAL array, dimension ( LDU, SMLSIZ+1 ).
+           On entry, VT' contains the right singular vector matrices of
+           all subproblems at the bottom level.
+
+    K      (input) INTEGER array, dimension ( N ).
+
+    DIFL   (input) REAL array, dimension ( LDU, NLVL ).
+           where NLVL = INT(log_2 (N/(SMLSIZ+1))) + 1.
+
+    DIFR   (input) REAL array, dimension ( LDU, 2 * NLVL ).
+           On entry, DIFL(*, I) and DIFR(*, 2 * I -1) record
+           distances between singular values on the I-th level and
+           singular values on the (I -1)-th level, and DIFR(*, 2 * I)
+           record the normalizing factors of the right singular vectors
+           matrices of subproblems on I-th level.
+
+    Z      (input) REAL array, dimension ( LDU, NLVL ).
+           On entry, Z(1, I) contains the components of the deflation-
+           adjusted updating row vector for subproblems on the I-th
+           level.
+
+    POLES  (input) REAL array, dimension ( LDU, 2 * NLVL ).
+           On entry, POLES(*, 2 * I -1: 2 * I) contains the new and old
+           singular values involved in the secular equations on the I-th
+           level.
+
+    GIVPTR (input) INTEGER array, dimension ( N ).
+           On entry, GIVPTR( I ) records the number of Givens
+           rotations performed on the I-th problem on the computation
+           tree.
+
+    GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 * NLVL ).
+           On entry, for each I, GIVCOL(*, 2 * I - 1: 2 * I) records the
+           locations of Givens rotations performed on the I-th level on
+           the computation tree.
+
+    LDGCOL (input) INTEGER, LDGCOL = > N.
+           The leading dimension of arrays GIVCOL and PERM.
+
+    PERM   (input) INTEGER array, dimension ( LDGCOL, NLVL ).
+           On entry, PERM(*, I) records permutations done on the I-th
+           level of the computation tree.
+
+    GIVNUM (input) REAL array, dimension ( LDU, 2 * NLVL ).
+           On entry, GIVNUM(*, 2 *I -1 : 2 * I) records the C- and S-
+           values of Givens rotations performed on the I-th level on the
+           computation tree.
+
+    C      (input) REAL array, dimension ( N ).
+           On entry, if the I-th subproblem is not square,
+           C( I ) contains the C-value of a Givens rotation related to
+           the right null space of the I-th subproblem.
+
+    S      (input) REAL array, dimension ( N ).
+           On entry, if the I-th subproblem is not square,
+           S( I ) contains the S-value of a Givens rotation related to
+           the right null space of the I-th subproblem.
+
+    RWORK  (workspace) REAL array, dimension at least
+           MAX( (SMLSZ+1)*NRHS*3, N*(1+NRHS) + 2*NRHS ).
+
+    IWORK  (workspace) INTEGER array.
+           The dimension must be at least 3 * N
+
+    INFO   (output) INTEGER
+            = 0:  successful exit.
+            < 0:  if INFO = -i, the i-th argument had an illegal value.
+
+    Further Details
+    ===============
+
+    Based on contributions by
+       Ming Gu and Ren-Cang Li, Computer Science Division, University of
+         California at Berkeley, USA
+       Osni Marques, LBNL/NERSC, USA
+
+    =====================================================================
+
+
+       Test the input parameters.
+*/
+
+    /* Parameter adjustments */
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    bx_dim1 = *ldbx;
+    bx_offset = 1 + bx_dim1;
+    bx -= bx_offset;
+    givnum_dim1 = *ldu;
+    givnum_offset = 1 + givnum_dim1;
+    givnum -= givnum_offset;
+    poles_dim1 = *ldu;
+    poles_offset = 1 + poles_dim1;
+    poles -= poles_offset;
+    z_dim1 = *ldu;
+    z_offset = 1 + z_dim1;
+    z__ -= z_offset;
+    difr_dim1 = *ldu;
+    difr_offset = 1 + difr_dim1;
+    difr -= difr_offset;
+    difl_dim1 = *ldu;
+    difl_offset = 1 + difl_dim1;
+    difl -= difl_offset;
+    vt_dim1 = *ldu;
+    vt_offset = 1 + vt_dim1;
+    vt -= vt_offset;
+    u_dim1 = *ldu;
+    u_offset = 1 + u_dim1;
+    u -= u_offset;
+    --k;
+    --givptr;
+    perm_dim1 = *ldgcol;
+    perm_offset = 1 + perm_dim1;
+    perm -= perm_offset;
+    givcol_dim1 = *ldgcol;
+    givcol_offset = 1 + givcol_dim1;
+    givcol -= givcol_offset;
+    --c__;
+    --s;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+    if (*icompq < 0 || *icompq > 1) {
+	*info = -1;
+    } else if (*smlsiz < 3) {
+	*info = -2;
+    } else if (*n < *smlsiz) {
+	*info = -3;
+    } else if (*nrhs < 1) {
+	*info = -4;
+    } else if (*ldb < *n) {
+	*info = -6;
+    } else if (*ldbx < *n) {
+	*info = -8;
+    } else if (*ldu < *n) {
+	*info = -10;
+    } else if (*ldgcol < *n) {
+	*info = -19;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CLALSA", &i__1);
+	return 0;
+    }
+
+/*     Book-keeping and  setting up the computation tree. */
+
+    inode = 1;
+    ndiml = inode + *n;
+    ndimr = ndiml + *n;
+
+    slasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+	    smlsiz);
+
+/*
+       The following code applies back the left singular vector factors.
+       For applying back the right singular vector factors, go to 170.
+*/
+
+    if (*icompq == 1) {
+	goto L170;
+    }
+
+/*
+       The nodes on the bottom level of the tree were solved
+       by SLASDQ. The corresponding left and right singular vector
+       matrices are in explicit form. First apply back the left
+       singular vector matrices.
+*/
+
+    ndb1 = (nd + 1) / 2;
+    i__1 = nd;
+    for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+          IC : center row of each node
+          NL : number of rows of left  subproblem
+          NR : number of rows of right subproblem
+          NLF: starting row of the left   subproblem
+          NRF: starting row of the right  subproblem
+*/
+
+	i1 = i__ - 1;
+	ic = iwork[inode + i1];
+	nl = iwork[ndiml + i1];
+	nr = iwork[ndimr + i1];
+	nlf = ic - nl;
+	nrf = ic + 1;
+
+/*
+          Since B and BX are complex, the following call to SGEMM
+          is performed in two steps (real and imaginary parts).
+
+          CALL SGEMM( 'T', 'N', NL, NRHS, NL, ONE, U( NLF, 1 ), LDU,
+       $               B( NLF, 1 ), LDB, ZERO, BX( NLF, 1 ), LDBX )
+*/
+
+	j = nl * *nrhs << 1;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nlf + nl - 1;
+	    for (jrow = nlf; jrow <= i__3; ++jrow) {
+		++j;
+		i__4 = jrow + jcol * b_dim1;
+		rwork[j] = b[i__4].r;
+/* L10: */
+	    }
+/* L20: */
+	}
+	sgemm_("T", "N", &nl, nrhs, &nl, &c_b1034, &u[nlf + u_dim1], ldu, &
+		rwork[(nl * *nrhs << 1) + 1], &nl, &c_b328, &rwork[1], &nl);
+	j = nl * *nrhs << 1;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nlf + nl - 1;
+	    for (jrow = nlf; jrow <= i__3; ++jrow) {
+		++j;
+		rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L30: */
+	    }
+/* L40: */
+	}
+	sgemm_("T", "N", &nl, nrhs, &nl, &c_b1034, &u[nlf + u_dim1], ldu, &
+		rwork[(nl * *nrhs << 1) + 1], &nl, &c_b328, &rwork[nl * *nrhs
+		+ 1], &nl);
+	jreal = 0;
+	jimag = nl * *nrhs;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nlf + nl - 1;
+	    for (jrow = nlf; jrow <= i__3; ++jrow) {
+		++jreal;
+		++jimag;
+		i__4 = jrow + jcol * bx_dim1;
+		i__5 = jreal;
+		i__6 = jimag;
+		q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+		bx[i__4].r = q__1.r, bx[i__4].i = q__1.i;
+/* L50: */
+	    }
+/* L60: */
+	}
+
+/*
+          Since B and BX are complex, the following call to SGEMM
+          is performed in two steps (real and imaginary parts).
+
+          CALL SGEMM( 'T', 'N', NR, NRHS, NR, ONE, U( NRF, 1 ), LDU,
+      $               B( NRF, 1 ), LDB, ZERO, BX( NRF, 1 ), LDBX )
+*/
+
+	j = nr * *nrhs << 1;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nrf + nr - 1;
+	    for (jrow = nrf; jrow <= i__3; ++jrow) {
+		++j;
+		i__4 = jrow + jcol * b_dim1;
+		rwork[j] = b[i__4].r;
+/* L70: */
+	    }
+/* L80: */
+	}
+	sgemm_("T", "N", &nr, nrhs, &nr, &c_b1034, &u[nrf + u_dim1], ldu, &
+		rwork[(nr * *nrhs << 1) + 1], &nr, &c_b328, &rwork[1], &nr);
+	j = nr * *nrhs << 1;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nrf + nr - 1;
+	    for (jrow = nrf; jrow <= i__3; ++jrow) {
+		++j;
+		rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L90: */
+	    }
+/* L100: */
+	}
+	sgemm_("T", "N", &nr, nrhs, &nr, &c_b1034, &u[nrf + u_dim1], ldu, &
+		rwork[(nr * *nrhs << 1) + 1], &nr, &c_b328, &rwork[nr * *nrhs
+		+ 1], &nr);
+	jreal = 0;
+	jimag = nr * *nrhs;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nrf + nr - 1;
+	    for (jrow = nrf; jrow <= i__3; ++jrow) {
+		++jreal;
+		++jimag;
+		i__4 = jrow + jcol * bx_dim1;
+		i__5 = jreal;
+		i__6 = jimag;
+		q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+		bx[i__4].r = q__1.r, bx[i__4].i = q__1.i;
+/* L110: */
+	    }
+/* L120: */
+	}
+
+/* L130: */
+    }
+
+/*
+       Next copy the rows of B that correspond to unchanged rows
+       in the bidiagonal matrix to BX.
+*/
+
+    i__1 = nd;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	ic = iwork[inode + i__ - 1];
+	ccopy_(nrhs, &b[ic + b_dim1], ldb, &bx[ic + bx_dim1], ldbx);
+/* L140: */
+    }
+
+/*
+       Finally go through the left singular vector matrices of all
+       the other subproblems bottom-up on the tree.
+*/
+
+    j = pow_ii(&c__2, &nlvl);
+    sqre = 0;
+
+    for (lvl = nlvl; lvl >= 1; --lvl) {
+	lvl2 = (lvl << 1) - 1;
+
+/*
+          find the first node LF and last node LL on
+          the current level LVL
+*/
+
+	if (lvl == 1) {
+	    lf = 1;
+	    ll = 1;
+	} else {
+	    i__1 = lvl - 1;
+	    lf = pow_ii(&c__2, &i__1);
+	    ll = (lf << 1) - 1;
+	}
+	i__1 = ll;
+	for (i__ = lf; i__ <= i__1; ++i__) {
+	    im1 = i__ - 1;
+	    ic = iwork[inode + im1];
+	    nl = iwork[ndiml + im1];
+	    nr = iwork[ndimr + im1];
+	    nlf = ic - nl;
+	    nrf = ic + 1;
+	    --j;
+	    clals0_(icompq, &nl, &nr, &sqre, nrhs, &bx[nlf + bx_dim1], ldbx, &
+		    b[nlf + b_dim1], ldb, &perm[nlf + lvl * perm_dim1], &
+		    givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+		    givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+		     poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+		    lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+		    j], &s[j], &rwork[1], info);
+/* L150: */
+	}
+/* L160: */
+    }
+    goto L330;
+
+/*     ICOMPQ = 1: applying back the right singular vector factors. */
+
+L170:
+
+/*
+       First now go through the right singular vector matrices of all
+       the tree nodes top-down.
+*/
+
+    j = 0;
+    i__1 = nlvl;
+    for (lvl = 1; lvl <= i__1; ++lvl) {
+	lvl2 = (lvl << 1) - 1;
+
+/*
+          Find the first node LF and last node LL on
+          the current level LVL.
+*/
+
+	if (lvl == 1) {
+	    lf = 1;
+	    ll = 1;
+	} else {
+	    i__2 = lvl - 1;
+	    lf = pow_ii(&c__2, &i__2);
+	    ll = (lf << 1) - 1;
+	}
+	i__2 = lf;
+	for (i__ = ll; i__ >= i__2; --i__) {
+	    im1 = i__ - 1;
+	    ic = iwork[inode + im1];
+	    nl = iwork[ndiml + im1];
+	    nr = iwork[ndimr + im1];
+	    nlf = ic - nl;
+	    nrf = ic + 1;
+	    if (i__ == ll) {
+		sqre = 0;
+	    } else {
+		sqre = 1;
+	    }
+	    ++j;
+	    clals0_(icompq, &nl, &nr, &sqre, nrhs, &b[nlf + b_dim1], ldb, &bx[
+		    nlf + bx_dim1], ldbx, &perm[nlf + lvl * perm_dim1], &
+		    givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+		    givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+		     poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+		    lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+		    j], &s[j], &rwork[1], info);
+/* L180: */
+	}
+/* L190: */
+    }
+
+/*
+       The nodes on the bottom level of the tree were solved
+       by SLASDQ. The corresponding right singular vector
+       matrices are in explicit form. Apply them back.
+*/
+
+    ndb1 = (nd + 1) / 2;
+    i__1 = nd;
+    for (i__ = ndb1; i__ <= i__1; ++i__) {
+	i1 = i__ - 1;
+	ic = iwork[inode + i1];
+	nl = iwork[ndiml + i1];
+	nr = iwork[ndimr + i1];
+	nlp1 = nl + 1;
+	if (i__ == nd) {
+	    nrp1 = nr;
+	} else {
+	    nrp1 = nr + 1;
+	}
+	nlf = ic - nl;
+	nrf = ic + 1;
+
+/*
+          Since B and BX are complex, the following call to SGEMM is
+          performed in two steps (real and imaginary parts).
+
+          CALL SGEMM( 'T', 'N', NLP1, NRHS, NLP1, ONE, VT( NLF, 1 ), LDU,
+      $               B( NLF, 1 ), LDB, ZERO, BX( NLF, 1 ), LDBX )
+*/
+
+	j = nlp1 * *nrhs << 1;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nlf + nlp1 - 1;
+	    for (jrow = nlf; jrow <= i__3; ++jrow) {
+		++j;
+		i__4 = jrow + jcol * b_dim1;
+		rwork[j] = b[i__4].r;
+/* L200: */
+	    }
+/* L210: */
+	}
+	sgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b1034, &vt[nlf + vt_dim1],
+		ldu, &rwork[(nlp1 * *nrhs << 1) + 1], &nlp1, &c_b328, &rwork[
+		1], &nlp1);
+	j = nlp1 * *nrhs << 1;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nlf + nlp1 - 1;
+	    for (jrow = nlf; jrow <= i__3; ++jrow) {
+		++j;
+		rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L220: */
+	    }
+/* L230: */
+	}
+	sgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b1034, &vt[nlf + vt_dim1],
+		ldu, &rwork[(nlp1 * *nrhs << 1) + 1], &nlp1, &c_b328, &rwork[
+		nlp1 * *nrhs + 1], &nlp1);
+	jreal = 0;
+	jimag = nlp1 * *nrhs;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nlf + nlp1 - 1;
+	    for (jrow = nlf; jrow <= i__3; ++jrow) {
+		++jreal;
+		++jimag;
+		i__4 = jrow + jcol * bx_dim1;
+		i__5 = jreal;
+		i__6 = jimag;
+		q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+		bx[i__4].r = q__1.r, bx[i__4].i = q__1.i;
+/* L240: */
+	    }
+/* L250: */
+	}
+
+/*
+          Since B and BX are complex, the following call to SGEMM is
+          performed in two steps (real and imaginary parts).
+
+          CALL SGEMM( 'T', 'N', NRP1, NRHS, NRP1, ONE, VT( NRF, 1 ), LDU,
+      $               B( NRF, 1 ), LDB, ZERO, BX( NRF, 1 ), LDBX )
+*/
+
+	j = nrp1 * *nrhs << 1;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nrf + nrp1 - 1;
+	    for (jrow = nrf; jrow <= i__3; ++jrow) {
+		++j;
+		i__4 = jrow + jcol * b_dim1;
+		rwork[j] = b[i__4].r;
+/* L260: */
+	    }
+/* L270: */
+	}
+	sgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b1034, &vt[nrf + vt_dim1],
+		ldu, &rwork[(nrp1 * *nrhs << 1) + 1], &nrp1, &c_b328, &rwork[
+		1], &nrp1);
+	j = nrp1 * *nrhs << 1;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nrf + nrp1 - 1;
+	    for (jrow = nrf; jrow <= i__3; ++jrow) {
+		++j;
+		rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L280: */
+	    }
+/* L290: */
+	}
+	sgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b1034, &vt[nrf + vt_dim1],
+		ldu, &rwork[(nrp1 * *nrhs << 1) + 1], &nrp1, &c_b328, &rwork[
+		nrp1 * *nrhs + 1], &nrp1);
+	jreal = 0;
+	jimag = nrp1 * *nrhs;
+	i__2 = *nrhs;
+	for (jcol = 1; jcol <= i__2; ++jcol) {
+	    i__3 = nrf + nrp1 - 1;
+	    for (jrow = nrf; jrow <= i__3; ++jrow) {
+		++jreal;
+		++jimag;
+		i__4 = jrow + jcol * bx_dim1;
+		i__5 = jreal;
+		i__6 = jimag;
+		q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+		bx[i__4].r = q__1.r, bx[i__4].i = q__1.i;
+/* L300: */
+	    }
+/* L310: */
+	}
+
+/* L320: */
+    }
+
+L330:
+
+    return 0;
+
+/*     End of CLALSA */
+
+} /* clalsa_ */
+
+/* Subroutine */ int clalsd_(char *uplo, integer *smlsiz, integer *n, integer
+	*nrhs, real *d__, real *e, complex *b, integer *ldb, real *rcond,
+	integer *rank, complex *work, real *rwork, integer *iwork, integer *
+	info)
+{
+    /* System generated locals */
+    integer b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5, i__6;
+    real r__1;
+    complex q__1;
+
+    /* Local variables */
+    static integer c__, i__, j, k;
+    static real r__;
+    static integer s, u, z__;
+    static real cs;
+    static integer bx;
+    static real sn;
+    static integer st, vt, nm1, st1;
+    static real eps;
+    static integer iwk;
+    static real tol;
+    static integer difl, difr;
+    static real rcnd;
+    static integer jcol, irwb, perm, nsub, nlvl, sqre, bxst, jrow, irwu,
+	    jimag, jreal;
+    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+	    integer *, real *, real *, integer *, real *, integer *, real *,
+	    real *, integer *);
+    static integer irwib;
+    extern /* Subroutine */ int ccopy_(integer *, complex *, integer *,
+	    complex *, integer *);
+    static integer poles, sizei, irwrb, nsize;
+    extern /* Subroutine */ int csrot_(integer *, complex *, integer *,
+	    complex *, integer *, real *, real *);
+    static integer irwvt, icmpq1, icmpq2;
+    extern /* Subroutine */ int clalsa_(integer *, integer *, integer *,
+	    integer *, complex *, integer *, complex *, integer *, real *,
+	    integer *, real *, integer *, real *, real *, real *, real *,
+	    integer *, integer *, integer *, integer *, real *, real *, real *
+	    , real *, integer *, integer *), clascl_(char *, integer *,
+	    integer *, real *, real *, integer *, integer *, complex *,
+	    integer *, integer *);
+    extern doublereal slamch_(char *);
+    extern /* Subroutine */ int slasda_(integer *, integer *, integer *,
+	    integer *, real *, real *, real *, integer *, real *, integer *,
+	    real *, real *, real *, real *, integer *, integer *, integer *,
+	    integer *, real *, real *, real *, real *, integer *, integer *),
+	    clacpy_(char *, integer *, integer *, complex *, integer *,
+	    complex *, integer *), claset_(char *, integer *, integer
+	    *, complex *, complex *, complex *, integer *), xerbla_(
+	    char *, integer *), slascl_(char *, integer *, integer *,
+	    real *, real *, integer *, integer *, real *, integer *, integer *
+	    );
+    extern integer isamax_(integer *, real *, integer *);
+    static integer givcol;
+    extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer
+	    *, integer *, integer *, real *, real *, real *, integer *, real *
+	    , integer *, real *, integer *, real *, integer *),
+	    slaset_(char *, integer *, integer *, real *, real *, real *,
+	    integer *), slartg_(real *, real *, real *, real *, real *
+	    );
+    static real orgnrm;
+    static integer givnum;
+    extern doublereal slanst_(char *, integer *, real *, real *);
+    extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *);
+    static integer givptr, nrwork, irwwrk, smlszp;
+
+
+/*
+    -- LAPACK routine (version 3.2.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       June 2010
+
+
+    Purpose
+    =======
+
+    CLALSD uses the singular value decomposition of A to solve the least
+    squares problem of finding X to minimize the Euclidean norm of each
+    column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+    are N-by-NRHS. The solution X overwrites B.
+
+    The singular values of A smaller than RCOND times the largest
+    singular value are treated as zero in solving the least squares
+    problem; in this case a minimum norm solution is returned.
+    The actual singular values are returned in D in ascending order.
+
+    This code makes very mild assumptions about floating point
+    arithmetic. It will work on machines with a guard digit in
+    add/subtract, or on those binary machines without guard digits
+    which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+    It could conceivably fail on hexadecimal or decimal machines
+    without guard digits, but we know of none.
+
+    Arguments
+    =========
+
+    UPLO   (input) CHARACTER*1
+           = 'U': D and E define an upper bidiagonal matrix.
+           = 'L': D and E define a  lower bidiagonal matrix.
+
+    SMLSIZ (input) INTEGER
+           The maximum size of the subproblems at the bottom of the
+           computation tree.
+
+    N      (input) INTEGER
+           The dimension of the  bidiagonal matrix.  N >= 0.
+
+    NRHS   (input) INTEGER
+           The number of columns of B. NRHS must be at least 1.
+
+    D      (input/output) REAL array, dimension (N)
+           On entry D contains the main diagonal of the bidiagonal
+           matrix. On exit, if INFO = 0, D contains its singular values.
+
+    E      (input/output) REAL array, dimension (N-1)
+           Contains the super-diagonal entries of the bidiagonal matrix.
+           On exit, E has been destroyed.
+
+    B      (input/output) COMPLEX array, dimension (LDB,NRHS)
+           On input, B contains the right hand sides of the least
+           squares problem. On output, B contains the solution X.
+
+    LDB    (input) INTEGER
+           The leading dimension of B in the calling subprogram.
+           LDB must be at least max(1,N).
+
+    RCOND  (input) REAL
+           The singular values of A less than or equal to RCOND times
+           the largest singular value are treated as zero in solving
+           the least squares problem. If RCOND is negative,
+           machine precision is used instead.
+           For example, if diag(S)*X=B were the least squares problem,
+           where diag(S) is a diagonal matrix of singular values, the
+           solution would be X(i) = B(i) / S(i) if S(i) is greater than
+           RCOND*max(S), and X(i) = 0 if S(i) is less than or equal to
+           RCOND*max(S).
+
+    RANK   (output) INTEGER
+           The number of singular values of A greater than RCOND times
+           the largest singular value.
+
+    WORK   (workspace) COMPLEX array, dimension (N * NRHS).
+
+    RWORK  (workspace) REAL array, dimension at least
+           (9*N + 2*N*SMLSIZ + 8*N*NLVL + 3*SMLSIZ*NRHS +
+           MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS ),
+           where
+           NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 )
+
+    IWORK  (workspace) INTEGER array, dimension (3*N*NLVL + 11*N).
+
+    INFO   (output) INTEGER
+           = 0:  successful exit.
+           < 0:  if INFO = -i, the i-th argument had an illegal value.
+           > 0:  The algorithm failed to compute a singular value while
+                 working on the submatrix lying in rows and columns
+                 INFO/(N+1) through MOD(INFO,N+1).
+
+    Further Details
+    ===============
+
+    Based on contributions by
+       Ming Gu and Ren-Cang Li, Computer Science Division, University of
+         California at Berkeley, USA
+       Osni Marques, LBNL/NERSC, USA
+
+    =====================================================================
+
+
+       Test the input parameters.
+*/
+
+    /* Parameter adjustments */
+    --d__;
+    --e;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    --work;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+    if (*n < 0) {
+	*info = -3;
+    } else if (*nrhs < 1) {
+	*info = -4;
+    } else if (*ldb < 1 || *ldb < *n) {
+	*info = -8;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CLALSD", &i__1);
+	return 0;
+    }
+
+    eps = slamch_("Epsilon");
+
+/*     Set up the tolerance. */
+
+    if (*rcond <= 0.f || *rcond >= 1.f) {
+	rcnd = eps;
+    } else {
+	rcnd = *rcond;
+    }
+
+    *rank = 0;
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    } else if (*n == 1) {
+	if (d__[1] == 0.f) {
+	    claset_("A", &c__1, nrhs, &c_b56, &c_b56, &b[b_offset], ldb);
+	} else {
+	    *rank = 1;
+	    clascl_("G", &c__0, &c__0, &d__[1], &c_b1034, &c__1, nrhs, &b[
+		    b_offset], ldb, info);
+	    d__[1] = dabs(d__[1]);
+	}
+	return 0;
+    }
+
+/*     Rotate the matrix if it is lower bidiagonal. */
+
+    if (*(unsigned char *)uplo == 'L') {
+	i__1 = *n - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    slartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+	    d__[i__] = r__;
+	    e[i__] = sn * d__[i__ + 1];
+	    d__[i__ + 1] = cs * d__[i__ + 1];
+	    if (*nrhs == 1) {
+		csrot_(&c__1, &b[i__ + b_dim1], &c__1, &b[i__ + 1 + b_dim1], &
+			c__1, &cs, &sn);
+	    } else {
+		rwork[(i__ << 1) - 1] = cs;
+		rwork[i__ * 2] = sn;
+	    }
+/* L10: */
+	}
+	if (*nrhs > 1) {
+	    i__1 = *nrhs;
+	    for (i__ = 1; i__ <= i__1; ++i__) {
+		i__2 = *n - 1;
+		for (j = 1; j <= i__2; ++j) {
+		    cs = rwork[(j << 1) - 1];
+		    sn = rwork[j * 2];
+		    csrot_(&c__1, &b[j + i__ * b_dim1], &c__1, &b[j + 1 + i__
+			    * b_dim1], &c__1, &cs, &sn);
+/* L20: */
+		}
+/* L30: */
+	    }
+	}
+    }
+
+/*     Scale. */
+
+    nm1 = *n - 1;
+    orgnrm = slanst_("M", n, &d__[1], &e[1]);
+    if (orgnrm == 0.f) {
+	claset_("A", n, nrhs, &c_b56, &c_b56, &b[b_offset], ldb);
+	return 0;
+    }
+
+    slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, &c__1, &d__[1], n, info);
+    slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &nm1, &c__1, &e[1], &nm1,
+	    info);
+
+/*
+       If N is smaller than the minimum divide size SMLSIZ, then solve
+       the problem with another solver.
+*/
+
+    if (*n <= *smlsiz) {
+	irwu = 1;
+	irwvt = irwu + *n * *n;
+	irwwrk = irwvt + *n * *n;
+	irwrb = irwwrk;
+	irwib = irwrb + *n * *nrhs;
+	irwb = irwib + *n * *nrhs;
+	slaset_("A", n, n, &c_b328, &c_b1034, &rwork[irwu], n);
+	slaset_("A", n, n, &c_b328, &c_b1034, &rwork[irwvt], n);
+	slasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &rwork[irwvt], n,
+		&rwork[irwu], n, &rwork[irwwrk], &c__1, &rwork[irwwrk], info);
+	if (*info != 0) {
+	    return 0;
+	}
+
+/*
+          In the real version, B is passed to SLASDQ and multiplied
+          internally by Q'. Here B is complex and that product is
+          computed below in two steps (real and imaginary parts).
+*/
+
+	j = irwb - 1;
+	i__1 = *nrhs;
+	for (jcol = 1; jcol <= i__1; ++jcol) {
+	    i__2 = *n;
+	    for (jrow = 1; jrow <= i__2; ++jrow) {
+		++j;
+		i__3 = jrow + jcol * b_dim1;
+		rwork[j] = b[i__3].r;
+/* L40: */
+	    }
+/* L50: */
+	}
+	sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwu], n, &rwork[irwb],
+		n, &c_b328, &rwork[irwrb], n);
+	j = irwb - 1;
+	i__1 = *nrhs;
+	for (jcol = 1; jcol <= i__1; ++jcol) {
+	    i__2 = *n;
+	    for (jrow = 1; jrow <= i__2; ++jrow) {
+		++j;
+		rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L60: */
+	    }
+/* L70: */
+	}
+	sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwu], n, &rwork[irwb],
+		n, &c_b328, &rwork[irwib], n);
+	jreal = irwrb - 1;
+	jimag = irwib - 1;
+	i__1 = *nrhs;
+	for (jcol = 1; jcol <= i__1; ++jcol) {
+	    i__2 = *n;
+	    for (jrow = 1; jrow <= i__2; ++jrow) {
+		++jreal;
+		++jimag;
+		i__3 = jrow + jcol * b_dim1;
+		i__4 = jreal;
+		i__5 = jimag;
+		q__1.r = rwork[i__4], q__1.i = rwork[i__5];
+		b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L80: */
+	    }
+/* L90: */
+	}
+
+	tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1));
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (d__[i__] <= tol) {
+		claset_("A", &c__1, nrhs, &c_b56, &c_b56, &b[i__ + b_dim1],
+			ldb);
+	    } else {
+		clascl_("G", &c__0, &c__0, &d__[i__], &c_b1034, &c__1, nrhs, &
+			b[i__ + b_dim1], ldb, info);
+		++(*rank);
+	    }
+/* L100: */
+	}
+
+/*
+          Since B is complex, the following call to SGEMM is performed
+          in two steps (real and imaginary parts). That is for V * B
+          (in the real version of the code V' is stored in WORK).
+
+          CALL SGEMM( 'T', 'N', N, NRHS, N, ONE, WORK, N, B, LDB, ZERO,
+      $               WORK( NWORK ), N )
+*/
+
+	j = irwb - 1;
+	i__1 = *nrhs;
+	for (jcol = 1; jcol <= i__1; ++jcol) {
+	    i__2 = *n;
+	    for (jrow = 1; jrow <= i__2; ++jrow) {
+		++j;
+		i__3 = jrow + jcol * b_dim1;
+		rwork[j] = b[i__3].r;
+/* L110: */
+	    }
+/* L120: */
+	}
+	sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwvt], n, &rwork[irwb],
+		 n, &c_b328, &rwork[irwrb], n);
+	j = irwb - 1;
+	i__1 = *nrhs;
+	for (jcol = 1; jcol <= i__1; ++jcol) {
+	    i__2 = *n;
+	    for (jrow = 1; jrow <= i__2; ++jrow) {
+		++j;
+		rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L130: */
+	    }
+/* L140: */
+	}
+	sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwvt], n, &rwork[irwb],
+		 n, &c_b328, &rwork[irwib], n);
+	jreal = irwrb - 1;
+	jimag = irwib - 1;
+	i__1 = *nrhs;
+	for (jcol = 1; jcol <= i__1; ++jcol) {
+	    i__2 = *n;
+	    for (jrow = 1; jrow <= i__2; ++jrow) {
+		++jreal;
+		++jimag;
+		i__3 = jrow + jcol * b_dim1;
+		i__4 = jreal;
+		i__5 = jimag;
+		q__1.r = rwork[i__4], q__1.i = rwork[i__5];
+		b[i__3].r = q__1.r, b[i__3].i = q__1.i;
+/* L150: */
+	    }
+/* L160: */
+	}
+
+/*        Unscale. */
+
+	slascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, n, &c__1, &d__[1], n,
+		info);
+	slasrt_("D", n, &d__[1], info);
+	clascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, nrhs, &b[b_offset],
+		ldb, info);
+
+	return 0;
+    }
+
+/*     Book-keeping and setting up some constants. */
+
+    nlvl = (integer) (log((real) (*n) / (real) (*smlsiz + 1)) / log(2.f)) + 1;
+
+    smlszp = *smlsiz + 1;
+
+    u = 1;
+    vt = *smlsiz * *n + 1;
+    difl = vt + smlszp * *n;
+    difr = difl + nlvl * *n;
+    z__ = difr + (nlvl * *n << 1);
+    c__ = z__ + nlvl * *n;
+    s = c__ + *n;
+    poles = s + *n;
+    givnum = poles + (nlvl << 1) * *n;
+    nrwork = givnum + (nlvl << 1) * *n;
+    bx = 1;
+
+    irwrb = nrwork;
+    irwib = irwrb + *smlsiz * *nrhs;
+    irwb = irwib + *smlsiz * *nrhs;
+
+    sizei = *n + 1;
+    k = sizei + *n;
+    givptr = k + *n;
+    perm = givptr + *n;
+    givcol = perm + nlvl * *n;
+    iwk = givcol + (nlvl * *n << 1);
+
+    st = 1;
+    sqre = 0;
+    icmpq1 = 1;
+    icmpq2 = 0;
+    nsub = 0;
+
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if ((r__1 = d__[i__], dabs(r__1)) < eps) {
+	    d__[i__] = r_sign(&eps, &d__[i__]);
+	}
+/* L170: */
+    }
+
+    i__1 = nm1;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if ((r__1 = e[i__], dabs(r__1)) < eps || i__ == nm1) {
+	    ++nsub;
+	    iwork[nsub] = st;
+
+/*
+             Subproblem found. First determine its size and then
+             apply divide and conquer on it.
+*/
+
+	    if (i__ < nm1) {
+
+/*              A subproblem with E(I) small for I < NM1. */
+
+		nsize = i__ - st + 1;
+		iwork[sizei + nsub - 1] = nsize;
+	    } else if ((r__1 = e[i__], dabs(r__1)) >= eps) {
+
+/*              A subproblem with E(NM1) not too small but I = NM1. */
+
+		nsize = *n - st + 1;
+		iwork[sizei + nsub - 1] = nsize;
+	    } else {
+
+/*
+                A subproblem with E(NM1) small. This implies an
+                1-by-1 subproblem at D(N), which is not solved
+                explicitly.
+*/
+
+		nsize = i__ - st + 1;
+		iwork[sizei + nsub - 1] = nsize;
+		++nsub;
+		iwork[nsub] = *n;
+		iwork[sizei + nsub - 1] = 1;
+		ccopy_(nrhs, &b[*n + b_dim1], ldb, &work[bx + nm1], n);
+	    }
+	    st1 = st - 1;
+	    if (nsize == 1) {
+
+/*
+                This is a 1-by-1 subproblem and is not solved
+                explicitly.
+*/
+
+		ccopy_(nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n);
+	    } else if (nsize <= *smlsiz) {
+
+/*              This is a small subproblem and is solved by SLASDQ. */
+
+		slaset_("A", &nsize, &nsize, &c_b328, &c_b1034, &rwork[vt +
+			st1], n);
+		slaset_("A", &nsize, &nsize, &c_b328, &c_b1034, &rwork[u +
+			st1], n);
+		slasdq_("U", &c__0, &nsize, &nsize, &nsize, &c__0, &d__[st], &
+			e[st], &rwork[vt + st1], n, &rwork[u + st1], n, &
+			rwork[nrwork], &c__1, &rwork[nrwork], info)
+			;
+		if (*info != 0) {
+		    return 0;
+		}
+
+/*
+                In the real version, B is passed to SLASDQ and multiplied
+                internally by Q'. Here B is complex and that product is
+                computed below in two steps (real and imaginary parts).
+*/
+
+		j = irwb - 1;
+		i__2 = *nrhs;
+		for (jcol = 1; jcol <= i__2; ++jcol) {
+		    i__3 = st + nsize - 1;
+		    for (jrow = st; jrow <= i__3; ++jrow) {
+			++j;
+			i__4 = jrow + jcol * b_dim1;
+			rwork[j] = b[i__4].r;
+/* L180: */
+		    }
+/* L190: */
+		}
+		sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[u +
+			st1], n, &rwork[irwb], &nsize, &c_b328, &rwork[irwrb],
+			 &nsize);
+		j = irwb - 1;
+		i__2 = *nrhs;
+		for (jcol = 1; jcol <= i__2; ++jcol) {
+		    i__3 = st + nsize - 1;
+		    for (jrow = st; jrow <= i__3; ++jrow) {
+			++j;
+			rwork[j] = r_imag(&b[jrow + jcol * b_dim1]);
+/* L200: */
+		    }
+/* L210: */
+		}
+		sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[u +
+			st1], n, &rwork[irwb], &nsize, &c_b328, &rwork[irwib],
+			 &nsize);
+		jreal = irwrb - 1;
+		jimag = irwib - 1;
+		i__2 = *nrhs;
+		for (jcol = 1; jcol <= i__2; ++jcol) {
+		    i__3 = st + nsize - 1;
+		    for (jrow = st; jrow <= i__3; ++jrow) {
+			++jreal;
+			++jimag;
+			i__4 = jrow + jcol * b_dim1;
+			i__5 = jreal;
+			i__6 = jimag;
+			q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+			b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L220: */
+		    }
+/* L230: */
+		}
+
+		clacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx +
+			st1], n);
+	    } else {
+
+/*              A large problem. Solve it using divide and conquer. */
+
+		slasda_(&icmpq1, smlsiz, &nsize, &sqre, &d__[st], &e[st], &
+			rwork[u + st1], n, &rwork[vt + st1], &iwork[k + st1],
+			&rwork[difl + st1], &rwork[difr + st1], &rwork[z__ +
+			st1], &rwork[poles + st1], &iwork[givptr + st1], &
+			iwork[givcol + st1], n, &iwork[perm + st1], &rwork[
+			givnum + st1], &rwork[c__ + st1], &rwork[s + st1], &
+			rwork[nrwork], &iwork[iwk], info);
+		if (*info != 0) {
+		    return 0;
+		}
+		bxst = bx + st1;
+		clalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, &
+			work[bxst], n, &rwork[u + st1], n, &rwork[vt + st1], &
+			iwork[k + st1], &rwork[difl + st1], &rwork[difr + st1]
+			, &rwork[z__ + st1], &rwork[poles + st1], &iwork[
+			givptr + st1], &iwork[givcol + st1], n, &iwork[perm +
+			st1], &rwork[givnum + st1], &rwork[c__ + st1], &rwork[
+			s + st1], &rwork[nrwork], &iwork[iwk], info);
+		if (*info != 0) {
+		    return 0;
+		}
+	    }
+	    st = i__ + 1;
+	}
+/* L240: */
+    }
+
+/*     Apply the singular values and treat the tiny ones as zero. */
+
+    tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1));
+
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+          Some of the elements in D can be negative because 1-by-1
+          subproblems were not solved explicitly.
+*/
+
+	if ((r__1 = d__[i__], dabs(r__1)) <= tol) {
+	    claset_("A", &c__1, nrhs, &c_b56, &c_b56, &work[bx + i__ - 1], n);
+	} else {
+	    ++(*rank);
+	    clascl_("G", &c__0, &c__0, &d__[i__], &c_b1034, &c__1, nrhs, &
+		    work[bx + i__ - 1], n, info);
+	}
+	d__[i__] = (r__1 = d__[i__], dabs(r__1));
+/* L250: */
+    }
+
+/*     Now apply back the right singular vectors. */
+
+    icmpq2 = 1;
+    i__1 = nsub;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	st = iwork[i__];
+	st1 = st - 1;
+	nsize = iwork[sizei + i__ - 1];
+	bxst = bx + st1;
+	if (nsize == 1) {
+	    ccopy_(nrhs, &work[bxst], n, &b[st + b_dim1], ldb);
+	} else if (nsize <= *smlsiz) {
+
+/*
+             Since B and BX are complex, the following call to SGEMM
+             is performed in two steps (real and imaginary parts).
+
+             CALL SGEMM( 'T', 'N', NSIZE, NRHS, NSIZE, ONE,
+      $                  RWORK( VT+ST1 ), N, RWORK( BXST ), N, ZERO,
+      $                  B( ST, 1 ), LDB )
+*/
+
+	    j = bxst - *n - 1;
+	    jreal = irwb - 1;
+	    i__2 = *nrhs;
+	    for (jcol = 1; jcol <= i__2; ++jcol) {
+		j += *n;
+		i__3 = nsize;
+		for (jrow = 1; jrow <= i__3; ++jrow) {
+		    ++jreal;
+		    i__4 = j + jrow;
+		    rwork[jreal] = work[i__4].r;
+/* L260: */
+		}
+/* L270: */
+	    }
+	    sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[vt + st1],
+		     n, &rwork[irwb], &nsize, &c_b328, &rwork[irwrb], &nsize);
+	    j = bxst - *n - 1;
+	    jimag = irwb - 1;
+	    i__2 = *nrhs;
+	    for (jcol = 1; jcol <= i__2; ++jcol) {
+		j += *n;
+		i__3 = nsize;
+		for (jrow = 1; jrow <= i__3; ++jrow) {
+		    ++jimag;
+		    rwork[jimag] = r_imag(&work[j + jrow]);
+/* L280: */
+		}
+/* L290: */
+	    }
+	    sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[vt + st1],
+		     n, &rwork[irwb], &nsize, &c_b328, &rwork[irwib], &nsize);
+	    jreal = irwrb - 1;
+	    jimag = irwib - 1;
+	    i__2 = *nrhs;
+	    for (jcol = 1; jcol <= i__2; ++jcol) {
+		i__3 = st + nsize - 1;
+		for (jrow = st; jrow <= i__3; ++jrow) {
+		    ++jreal;
+		    ++jimag;
+		    i__4 = jrow + jcol * b_dim1;
+		    i__5 = jreal;
+		    i__6 = jimag;
+		    q__1.r = rwork[i__5], q__1.i = rwork[i__6];
+		    b[i__4].r = q__1.r, b[i__4].i = q__1.i;
+/* L300: */
+		}
+/* L310: */
+	    }
+	} else {
+	    clalsa_(&icmpq2, smlsiz, &nsize, nrhs, &work[bxst], n, &b[st +
+		    b_dim1], ldb, &rwork[u + st1], n, &rwork[vt + st1], &
+		    iwork[k + st1], &rwork[difl + st1], &rwork[difr + st1], &
+		    rwork[z__ + st1], &rwork[poles + st1], &iwork[givptr +
+		    st1], &iwork[givcol + st1], n, &iwork[perm + st1], &rwork[
+		    givnum + st1], &rwork[c__ + st1], &rwork[s + st1], &rwork[
+		    nrwork], &iwork[iwk], info);
+	    if (*info != 0) {
+		return 0;
+	    }
+	}
+/* L320: */
+    }
+
+/*     Unscale and sort the singular values. */
+
+    slascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, n, &c__1, &d__[1], n, info);
+    slasrt_("D", n, &d__[1], info);
+    clascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, nrhs, &b[b_offset], ldb,
+	    info);
+
+    return 0;
+
+/*     End of CLALSD */
+
+} /* clalsd_ */
+
 doublereal clange_(char *norm, integer *m, integer *n, complex *a, integer *
 	lda, real *work)
 {
@@ -15487,8 +18145,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
     }
 
     l = *m * *n + 1;
-    sgemm_("N", "N", m, n, m, &c_b894, &a[a_offset], lda, &rwork[1], m, &
-	    c_b1087, &rwork[l], m);
+    sgemm_("N", "N", m, n, m, &c_b1034, &a[a_offset], lda, &rwork[1], m, &
+	    c_b328, &rwork[l], m);
     i__1 = *n;
     for (j = 1; j <= i__1; ++j) {
 	i__2 = *m;
@@ -15510,8 +18168,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 	}
 /* L60: */
     }
-    sgemm_("N", "N", m, n, m, &c_b894, &a[a_offset], lda, &rwork[1], m, &
-	    c_b1087, &rwork[l], m);
+    sgemm_("N", "N", m, n, m, &c_b1034, &a[a_offset], lda, &rwork[1], m, &
+	    c_b328, &rwork[l], m);
     i__1 = *n;
     for (j = 1; j <= i__1; ++j) {
 	i__2 = *m;
@@ -19708,7 +22366,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 
 /*                 Scale x by 1/2. */
 
-		    csscal_(n, &c_b2023, &x[1], &c__1);
+		    csscal_(n, &c_b2435, &x[1], &c__1);
 		    *scale *= .5f;
 		}
 
@@ -20515,8 +23173,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 			    a[i__ + (i__ + ib) * a_dim1], lda, &c_b57, &a[i__
 			    * a_dim1 + 1], lda);
 		    i__3 = *n - i__ - ib + 1;
-		    cherk_("Upper", "No transpose", &ib, &i__3, &c_b894, &a[
-			    i__ + (i__ + ib) * a_dim1], lda, &c_b894, &a[i__
+		    cherk_("Upper", "No transpose", &ib, &i__3, &c_b1034, &a[
+			    i__ + (i__ + ib) * a_dim1], lda, &c_b1034, &a[i__
 			    + i__ * a_dim1], lda);
 		}
 /* L10: */
@@ -20545,8 +23203,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 			    a_dim1], lda);
 		    i__3 = *n - i__ - ib + 1;
 		    cherk_("Lower", "Conjugate transpose", &ib, &i__3, &
-			    c_b894, &a[i__ + ib + i__ * a_dim1], lda, &c_b894,
-			     &a[i__ + i__ * a_dim1], lda);
+			    c_b1034, &a[i__ + ib + i__ * a_dim1], lda, &
+			    c_b1034, &a[i__ + i__ * a_dim1], lda);
 		}
 /* L20: */
 	    }
@@ -20910,8 +23568,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 		i__3 = nb, i__4 = *n - j + 1;
 		jb = min(i__3,i__4);
 		i__3 = j - 1;
-		cherk_("Upper", "Conjugate transpose", &jb, &i__3, &c_b1136, &
-			a[j * a_dim1 + 1], lda, &c_b894, &a[j + j * a_dim1],
+		cherk_("Upper", "Conjugate transpose", &jb, &i__3, &c_b1276, &
+			a[j * a_dim1 + 1], lda, &c_b1034, &a[j + j * a_dim1],
 			lda);
 		cpotf2_("Upper", &jb, &a[j + j * a_dim1], lda, info);
 		if (*info != 0) {
@@ -20953,8 +23611,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 		i__3 = nb, i__4 = *n - j + 1;
 		jb = min(i__3,i__4);
 		i__3 = j - 1;
-		cherk_("Lower", "No transpose", &jb, &i__3, &c_b1136, &a[j +
-			a_dim1], lda, &c_b894, &a[j + j * a_dim1], lda);
+		cherk_("Lower", "No transpose", &jb, &i__3, &c_b1276, &a[j +
+			a_dim1], lda, &c_b1034, &a[j + j * a_dim1], lda);
 		cpotf2_("Lower", &jb, &a[j + j * a_dim1], lda, info);
 		if (*info != 0) {
 		    goto L30;
@@ -21681,7 +24339,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 /*        If COMPZ = 'I', we simply call SSTEDC instead. */
 
 	if (icompz == 2) {
-	    slaset_("Full", n, n, &c_b1087, &c_b894, &rwork[1], n);
+	    slaset_("Full", n, n, &c_b328, &c_b1034, &rwork[1], n);
 	    ll = *n * *n + 1;
 	    i__1 = *lrwork - ll + 1;
 	    sstedc_("I", n, &d__[1], &e[1], &rwork[1], n, &rwork[ll], &i__1, &
@@ -21748,12 +24406,12 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 /*              Scale. */
 
 		orgnrm = slanst_("M", &m, &d__[start], &e[start]);
-		slascl_("G", &c__0, &c__0, &orgnrm, &c_b894, &m, &c__1, &d__[
+		slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &m, &c__1, &d__[
 			start], &m, info);
 		i__1 = m - 1;
 		i__2 = m - 1;
-		slascl_("G", &c__0, &c__0, &orgnrm, &c_b894, &i__1, &c__1, &e[
-			start], &i__2, info);
+		slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &i__1, &c__1, &
+			e[start], &i__2, info);
 
 		claed0_(n, &m, &d__[start], &e[start], &z__[start * z_dim1 +
 			1], ldz, &work[1], n, &rwork[1], &iwork[1], info);
@@ -21765,7 +24423,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 
 /*              Scale back. */
 
-		slascl_("G", &c__0, &c__0, &c_b894, &orgnrm, &m, &c__1, &d__[
+		slascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, &m, &c__1, &d__[
 			start], &m, info);
 
 	    } else {
@@ -22166,7 +24824,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 /*        Form shift. */
 
 	g = (d__[l + 1] - p) / (e[l] * 2.f);
-	r__ = slapy2_(&g, &c_b894);
+	r__ = slapy2_(&g, &c_b1034);
 	g = d__[m] - p + e[l] / (g + r_sign(&r__, &g));
 
 	s = 1.f;
@@ -22292,7 +24950,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer *
 /*        Form shift. */
 
 	g = (d__[l - 1] - p) / (e[l - 1] * 2.f);
-	r__ = slapy2_(&g, &c_b894);
+	r__ = slapy2_(&g, &c_b1034);
 	g = d__[m] - p + e[l - 1] / (g + r_sign(&r__, &g));
 
 	s = 1.f;
diff --git a/numpy/linalg/lapack_lite/f2c_s_lapack.c b/numpy/linalg/lapack_lite/f2c_s_lapack.c
index 04080f81da97..fccb1f58b553 100644
--- a/numpy/linalg/lapack_lite/f2c_s_lapack.c
+++ b/numpy/linalg/lapack_lite/f2c_s_lapack.c
@@ -40,6 +40,7 @@ static integer c_n1 = -1;
 static integer c__3 = 3;
 static integer c__2 = 2;
 static integer c__65 = 65;
+static integer c__6 = 6;
 static integer c__12 = 12;
 static integer c__49 = 49;
 static integer c__4 = 4;
@@ -49,7 +50,7 @@ static integer c__15 = 15;
 static integer c__14 = 14;
 static integer c__16 = 16;
 static logical c_true = TRUE_;
-static real c_b2863 = 2.f;
+static real c_b3178 = 2.f;
 
 /* Subroutine */ int sbdsdc_(char *uplo, char *compq, integer *n, real *d__,
 	real *e, real *u, integer *ldu, real *vt, integer *ldvt, real *q,
@@ -4025,189 +4026,754 @@ static real c_b2863 = 2.f;
 
 } /* sgelqf_ */
 
-/* Subroutine */ int sgeqr2_(integer *m, integer *n, real *a, integer *lda,
-	real *tau, real *work, integer *info)
+/* Subroutine */ int sgelsd_(integer *m, integer *n, integer *nrhs, real *a,
+	integer *lda, real *b, integer *ldb, real *s, real *rcond, integer *
+	rank, real *work, integer *lwork, integer *iwork, integer *info)
 {
     /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3;
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4;
 
     /* Local variables */
-    static integer i__, k;
-    static real aii;
-    extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
-	    integer *, real *, real *, integer *, real *), xerbla_(
-	    char *, integer *), slarfg_(integer *, real *, real *,
-	    integer *, real *);
+    static integer ie, il, mm;
+    static real eps, anrm, bnrm;
+    static integer itau, nlvl, iascl, ibscl;
+    static real sfmin;
+    static integer minmn, maxmn, itaup, itauq, mnthr, nwork;
+    extern /* Subroutine */ int slabad_(real *, real *), sgebrd_(integer *,
+	    integer *, real *, integer *, real *, real *, real *, real *,
+	    real *, integer *, integer *);
+    extern doublereal slamch_(char *), slange_(char *, integer *,
+	    integer *, real *, integer *, real *);
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+	    integer *, integer *, ftnlen, ftnlen);
+    static real bignum;
+    extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer
+	    *, real *, real *, integer *, integer *), slalsd_(char *, integer
+	    *, integer *, integer *, real *, real *, real *, integer *, real *
+	    , integer *, real *, integer *, integer *), slascl_(char *
+	    , integer *, integer *, real *, real *, integer *, integer *,
+	    real *, integer *, integer *);
+    static integer wlalsd;
+    extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer
+	    *, real *, real *, integer *, integer *), slacpy_(char *, integer
+	    *, integer *, real *, integer *, real *, integer *),
+	    slaset_(char *, integer *, integer *, real *, real *, real *,
+	    integer *);
+    static integer ldwork;
+    extern /* Subroutine */ int sormbr_(char *, char *, char *, integer *,
+	    integer *, integer *, real *, integer *, real *, real *, integer *
+	    , real *, integer *, integer *);
+    static integer liwork, minwrk, maxwrk;
+    static real smlnum;
+    extern /* Subroutine */ int sormlq_(char *, char *, integer *, integer *,
+	    integer *, real *, integer *, real *, real *, integer *, real *,
+	    integer *, integer *);
+    static logical lquery;
+    static integer smlsiz;
+    extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *,
+	    integer *, real *, integer *, real *, real *, integer *, real *,
+	    integer *, integer *);
 
 
 /*
-    -- LAPACK routine (version 3.2.2) --
+    -- LAPACK driver routine (version 3.2) --
     -- LAPACK is a software package provided by Univ. of Tennessee,    --
     -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-       June 2010
+       November 2006
 
 
     Purpose
     =======
 
-    SGEQR2 computes a QR factorization of a real m by n matrix A:
-    A = Q * R.
+    SGELSD computes the minimum-norm solution to a real linear least
+    squares problem:
+        minimize 2-norm(| b - A*x |)
+    using the singular value decomposition (SVD) of A. A is an M-by-N
+    matrix which may be rank-deficient.
+
+    Several right hand side vectors b and solution vectors x can be
+    handled in a single call; they are stored as the columns of the
+    M-by-NRHS right hand side matrix B and the N-by-NRHS solution
+    matrix X.
+
+    The problem is solved in three steps:
+    (1) Reduce the coefficient matrix A to bidiagonal form with
+        Householder transformations, reducing the original problem
+        into a "bidiagonal least squares problem" (BLS)
+    (2) Solve the BLS using a divide and conquer approach.
+    (3) Apply back all the Householder tranformations to solve
+        the original least squares problem.
+
+    The effective rank of A is determined by treating as zero those
+    singular values which are less than RCOND times the largest singular
+    value.
+
+    The divide and conquer algorithm makes very mild assumptions about
+    floating point arithmetic. It will work on machines with a guard
+    digit in add/subtract, or on those binary machines without guard
+    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
+    Cray-2. It could conceivably fail on hexadecimal or decimal machines
+    without guard digits, but we know of none.
 
     Arguments
     =========
 
     M       (input) INTEGER
-            The number of rows of the matrix A.  M >= 0.
+            The number of rows of A. M >= 0.
 
     N       (input) INTEGER
-            The number of columns of the matrix A.  N >= 0.
+            The number of columns of A. N >= 0.
 
-    A       (input/output) REAL array, dimension (LDA,N)
-            On entry, the m by n matrix A.
-            On exit, the elements on and above the diagonal of the array
-            contain the min(m,n) by n upper trapezoidal matrix R (R is
-            upper triangular if m >= n); the elements below the diagonal,
-            with the array TAU, represent the orthogonal matrix Q as a
-            product of elementary reflectors (see Further Details).
+    NRHS    (input) INTEGER
+            The number of right hand sides, i.e., the number of columns
+            of the matrices B and X. NRHS >= 0.
+
+    A       (input) REAL array, dimension (LDA,N)
+            On entry, the M-by-N matrix A.
+            On exit, A has been destroyed.
 
     LDA     (input) INTEGER
             The leading dimension of the array A.  LDA >= max(1,M).
 
-    TAU     (output) REAL array, dimension (min(M,N))
-            The scalar factors of the elementary reflectors (see Further
-            Details).
+    B       (input/output) REAL array, dimension (LDB,NRHS)
+            On entry, the M-by-NRHS right hand side matrix B.
+            On exit, B is overwritten by the N-by-NRHS solution
+            matrix X.  If m >= n and RANK = n, the residual
+            sum-of-squares for the solution in the i-th column is given
+            by the sum of squares of elements n+1:m in that column.
 
-    WORK    (workspace) REAL array, dimension (N)
+    LDB     (input) INTEGER
+            The leading dimension of the array B. LDB >= max(1,max(M,N)).
 
-    INFO    (output) INTEGER
-            = 0: successful exit
-            < 0: if INFO = -i, the i-th argument had an illegal value
+    S       (output) REAL array, dimension (min(M,N))
+            The singular values of A in decreasing order.
+            The condition number of A in the 2-norm = S(1)/S(min(m,n)).
 
-    Further Details
-    ===============
+    RCOND   (input) REAL
+            RCOND is used to determine the effective rank of A.
+            Singular values S(i) <= RCOND*S(1) are treated as zero.
+            If RCOND < 0, machine precision is used instead.
 
-    The matrix Q is represented as a product of elementary reflectors
+    RANK    (output) INTEGER
+            The effective rank of A, i.e., the number of singular values
+            which are greater than RCOND*S(1).
 
-       Q = H(1) H(2) . . . H(k), where k = min(m,n).
+    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))
+            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 
-    Each H(i) has the form
+    LWORK   (input) INTEGER
+            The dimension of the array WORK. LWORK must be at least 1.
+            The exact minimum amount of workspace needed depends on M,
+            N and NRHS. As long as LWORK is at least
+                12*N + 2*N*SMLSIZ + 8*N*NLVL + N*NRHS + (SMLSIZ+1)**2,
+            if M is greater than or equal to N or
+                12*M + 2*M*SMLSIZ + 8*M*NLVL + M*NRHS + (SMLSIZ+1)**2,
+            if M is less than N, the code will execute correctly.
+            SMLSIZ is returned by ILAENV and is equal to the maximum
+            size of the subproblems at the bottom of the computation
+            tree (usually about 25), and
+               NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 )
+            For good performance, LWORK should generally be larger.
 
-       H(i) = I - tau * v * v'
+            If LWORK = -1, then a workspace query is assumed; the routine
+            only calculates the optimal size of the array WORK and the
+            minimum size of the array IWORK, and returns these values as
+            the first entries of the WORK and IWORK arrays, and no error
+            message related to LWORK is issued by XERBLA.
 
-    where tau is a real scalar, and v is a real vector with
-    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
-    and tau in TAU(i).
+    IWORK   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
+            LIWORK >= max(1, 3*MINMN*NLVL + 11*MINMN),
+            where MINMN = MIN( M,N ).
+            On exit, if INFO = 0, IWORK(1) returns the minimum LIWORK.
+
+    INFO    (output) INTEGER
+            = 0:  successful exit
+            < 0:  if INFO = -i, the i-th argument had an illegal value.
+            > 0:  the algorithm for computing the SVD failed to converge;
+                  if INFO = i, i off-diagonal elements of an intermediate
+                  bidiagonal form did not converge to zero.
+
+    Further Details
+    ===============
+
+    Based on contributions by
+       Ming Gu and Ren-Cang Li, Computer Science Division, University of
+         California at Berkeley, USA
+       Osni Marques, LBNL/NERSC, USA
 
     =====================================================================
 
 
-       Test the input arguments
+       Test the input arguments.
 */
 
     /* Parameter adjustments */
     a_dim1 = *lda;
     a_offset = 1 + a_dim1;
     a -= a_offset;
-    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    --s;
     --work;
+    --iwork;
 
     /* Function Body */
     *info = 0;
+    minmn = min(*m,*n);
+    maxmn = max(*m,*n);
+    lquery = *lwork == -1;
     if (*m < 0) {
 	*info = -1;
     } else if (*n < 0) {
 	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
     } else if (*lda < max(1,*m)) {
-	*info = -4;
-    }
-    if (*info != 0) {
-	i__1 = -(*info);
-	xerbla_("SGEQR2", &i__1);
-	return 0;
+	*info = -5;
+    } else if (*ldb < max(1,maxmn)) {
+	*info = -7;
     }
 
-    k = min(*m,*n);
+/*
+       Compute workspace.
+       (Note: Comments in the code beginning "Workspace:" describe the
+       minimal amount of workspace needed at that point in the code,
+       as well as the preferred amount for good performance.
+       NB refers to the optimal block size for the immediately
+       following subroutine, as returned by ILAENV.)
+*/
 
-    i__1 = k;
-    for (i__ = 1; i__ <= i__1; ++i__) {
+    if (*info == 0) {
+	minwrk = 1;
+	maxwrk = 1;
+	liwork = 1;
+	if (minmn > 0) {
+	    smlsiz = ilaenv_(&c__9, "SGELSD", " ", &c__0, &c__0, &c__0, &c__0,
+		     (ftnlen)6, (ftnlen)1);
+	    mnthr = ilaenv_(&c__6, "SGELSD", " ", m, n, nrhs, &c_n1, (ftnlen)
+		    6, (ftnlen)1);
+/* Computing MAX */
+	    i__1 = (integer) (log((real) minmn / (real) (smlsiz + 1)) / log(
+		    2.f)) + 1;
+	    nlvl = max(i__1,0);
+	    liwork = minmn * 3 * nlvl + minmn * 11;
+	    mm = *m;
+	    if (*m >= *n && *m >= mnthr) {
 
-/*        Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
+/*
+                Path 1a - overdetermined, with many more rows than
+                          columns.
+*/
 
-	i__2 = *m - i__ + 1;
-/* Computing MIN */
-	i__3 = i__ + 1;
-	slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1]
-		, &c__1, &tau[i__]);
-	if (i__ < *n) {
+		mm = *n;
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = *n + *n * ilaenv_(&c__1, "SGEQRF",
+			" ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+		maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = *n + *nrhs * ilaenv_(&c__1, "SORMQR",
+			"LT", m, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)2);
+		maxwrk = max(i__1,i__2);
+	    }
+	    if (*m >= *n) {
 
-/*           Apply H(i) to A(i:m,i+1:n) from the left */
+/*
+                Path 1 - overdetermined or exactly determined.
 
-	    aii = a[i__ + i__ * a_dim1];
-	    a[i__ + i__ * a_dim1] = 1.f;
-	    i__2 = *m - i__ + 1;
-	    i__3 = *n - i__;
-	    slarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &tau[
-		    i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
-	    a[i__ + i__ * a_dim1] = aii;
+   Computing MAX
+*/
+		i__1 = maxwrk, i__2 = *n * 3 + (mm + *n) * ilaenv_(&c__1,
+			"SGEBRD", " ", &mm, n, &c_n1, &c_n1, (ftnlen)6, (
+			ftnlen)1);
+		maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = *n * 3 + *nrhs * ilaenv_(&c__1, "SORMBR"
+			, "QLT", &mm, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)3);
+		maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = *n * 3 + (*n - 1) * ilaenv_(&c__1,
+			"SORMBR", "PLN", n, nrhs, n, &c_n1, (ftnlen)6, (
+			ftnlen)3);
+		maxwrk = max(i__1,i__2);
+/* Computing 2nd power */
+		i__1 = smlsiz + 1;
+		wlalsd = *n * 9 + (*n << 1) * smlsiz + (*n << 3) * nlvl + *n *
+			 *nrhs + i__1 * i__1;
+/* Computing MAX */
+		i__1 = maxwrk, i__2 = *n * 3 + wlalsd;
+		maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		i__1 = *n * 3 + mm, i__2 = *n * 3 + *nrhs, i__1 = max(i__1,
+			i__2), i__2 = *n * 3 + wlalsd;
+		minwrk = max(i__1,i__2);
+	    }
+	    if (*n > *m) {
+/* Computing 2nd power */
+		i__1 = smlsiz + 1;
+		wlalsd = *m * 9 + (*m << 1) * smlsiz + (*m << 3) * nlvl + *m *
+			 *nrhs + i__1 * i__1;
+		if (*n >= mnthr) {
+
+/*
+                   Path 2a - underdetermined, with many more columns
+                             than rows.
+*/
+
+		    maxwrk = *m + *m * ilaenv_(&c__1, "SGELQF", " ", m, n, &
+			    c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m << 1) *
+			    ilaenv_(&c__1, "SGEBRD", " ", m, m, &c_n1, &c_n1,
+			    (ftnlen)6, (ftnlen)1);
+		    maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *nrhs *
+			    ilaenv_(&c__1, "SORMBR", "QLT", m, nrhs, m, &c_n1,
+			     (ftnlen)6, (ftnlen)3);
+		    maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m - 1) *
+			    ilaenv_(&c__1, "SORMBR", "PLN", m, nrhs, m, &c_n1,
+			     (ftnlen)6, (ftnlen)3);
+		    maxwrk = max(i__1,i__2);
+		    if (*nrhs > 1) {
+/* Computing MAX */
+			i__1 = maxwrk, i__2 = *m * *m + *m + *m * *nrhs;
+			maxwrk = max(i__1,i__2);
+		    } else {
+/* Computing MAX */
+			i__1 = maxwrk, i__2 = *m * *m + (*m << 1);
+			maxwrk = max(i__1,i__2);
+		    }
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m + *nrhs * ilaenv_(&c__1, "SORMLQ"
+			    , "LT", n, nrhs, m, &c_n1, (ftnlen)6, (ftnlen)2);
+		    maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + wlalsd;
+		    maxwrk = max(i__1,i__2);
+/*
+       XXX: Ensure the Path 2a case below is triggered.  The workspace
+       calculation should use queries for all routines eventually.
+   Computing MAX
+   Computing MAX
+*/
+		    i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4),
+			    i__3 = max(i__3,*nrhs), i__4 = *n - *m * 3;
+		    i__1 = maxwrk, i__2 = (*m << 2) + *m * *m + max(i__3,i__4)
+			    ;
+		    maxwrk = max(i__1,i__2);
+		} else {
+
+/*                 Path 2 - remaining underdetermined cases. */
+
+		    maxwrk = *m * 3 + (*n + *m) * ilaenv_(&c__1, "SGEBRD",
+			    " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * 3 + *nrhs * ilaenv_(&c__1,
+			    "SORMBR", "QLT", m, nrhs, n, &c_n1, (ftnlen)6, (
+			    ftnlen)3);
+		    maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORM"
+			    "BR", "PLN", n, nrhs, m, &c_n1, (ftnlen)6, (ftnlen)
+			    3);
+		    maxwrk = max(i__1,i__2);
+/* Computing MAX */
+		    i__1 = maxwrk, i__2 = *m * 3 + wlalsd;
+		    maxwrk = max(i__1,i__2);
+		}
+/* Computing MAX */
+		i__1 = *m * 3 + *nrhs, i__2 = *m * 3 + *m, i__1 = max(i__1,
+			i__2), i__2 = *m * 3 + wlalsd;
+		minwrk = max(i__1,i__2);
+	    }
+	}
+	minwrk = min(minwrk,maxwrk);
+	work[1] = (real) maxwrk;
+	iwork[1] = liwork;
+
+	if (*lwork < minwrk && ! lquery) {
+	    *info = -12;
 	}
-/* L10: */
     }
-    return 0;
 
-/*     End of SGEQR2 */
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGELSD", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
 
-} /* sgeqr2_ */
+/*     Quick return if possible. */
 
-/* Subroutine */ int sgeqrf_(integer *m, integer *n, real *a, integer *lda,
-	real *tau, real *work, integer *lwork, integer *info)
-{
-    /* System generated locals */
-    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+    if (*m == 0 || *n == 0) {
+	*rank = 0;
+	return 0;
+    }
 
-    /* Local variables */
-    static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
-    extern /* Subroutine */ int sgeqr2_(integer *, integer *, real *, integer
-	    *, real *, real *, integer *), slarfb_(char *, char *, char *,
-	    char *, integer *, integer *, integer *, real *, integer *, real *
-	    , integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *);
-    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
-	    integer *, integer *, ftnlen, ftnlen);
-    extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *,
-	    real *, integer *, real *, real *, integer *);
-    static integer ldwork, lwkopt;
-    static logical lquery;
+/*     Get machine parameters. */
 
+    eps = slamch_("P");
+    sfmin = slamch_("S");
+    smlnum = sfmin / eps;
+    bignum = 1.f / smlnum;
+    slabad_(&smlnum, &bignum);
 
-/*
-    -- LAPACK routine (version 3.2) --
-    -- LAPACK is a software package provided by Univ. of Tennessee,    --
-    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-       November 2006
+/*     Scale A if max entry outside range [SMLNUM,BIGNUM]. */
 
+    anrm = slange_("M", m, n, &a[a_offset], lda, &work[1]);
+    iascl = 0;
+    if (anrm > 0.f && anrm < smlnum) {
 
-    Purpose
-    =======
+/*        Scale matrix norm up to SMLNUM. */
 
-    SGEQRF computes a QR factorization of a real M-by-N matrix A:
-    A = Q * R.
+	slascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda,
+		info);
+	iascl = 1;
+    } else if (anrm > bignum) {
 
-    Arguments
-    =========
+/*        Scale matrix norm down to BIGNUM. */
 
-    M       (input) INTEGER
-            The number of rows of the matrix A.  M >= 0.
+	slascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda,
+		info);
+	iascl = 2;
+    } else if (anrm == 0.f) {
+
+/*        Matrix all zero. Return zero solution. */
+
+	i__1 = max(*m,*n);
+	slaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[b_offset], ldb);
+	slaset_("F", &minmn, &c__1, &c_b29, &c_b29, &s[1], &c__1);
+	*rank = 0;
+	goto L10;
+    }
+
+/*     Scale B if max entry outside range [SMLNUM,BIGNUM]. */
+
+    bnrm = slange_("M", m, nrhs, &b[b_offset], ldb, &work[1]);
+    ibscl = 0;
+    if (bnrm > 0.f && bnrm < smlnum) {
+
+/*        Scale matrix norm up to SMLNUM. */
+
+	slascl_("G", &c__0, &c__0, &bnrm, &smlnum, m, nrhs, &b[b_offset], ldb,
+		 info);
+	ibscl = 1;
+    } else if (bnrm > bignum) {
+
+/*        Scale matrix norm down to BIGNUM. */
+
+	slascl_("G", &c__0, &c__0, &bnrm, &bignum, m, nrhs, &b[b_offset], ldb,
+		 info);
+	ibscl = 2;
+    }
+
+/*     If M < N make sure certain entries of B are zero. */
+
+    if (*m < *n) {
+	i__1 = *n - *m;
+	slaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     Overdetermined case. */
+
+    if (*m >= *n) {
+
+/*        Path 1 - overdetermined or exactly determined. */
+
+	mm = *m;
+	if (*m >= mnthr) {
+
+/*           Path 1a - overdetermined, with many more rows than columns. */
+
+	    mm = *n;
+	    itau = 1;
+	    nwork = itau + *n;
+
+/*
+             Compute A=Q*R.
+             (Workspace: need 2*N, prefer N+N*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    sgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+		     info);
+
+/*
+             Multiply B by transpose(Q).
+             (Workspace: need N+NRHS, prefer N+NRHS*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    sormqr_("L", "T", m, nrhs, n, &a[a_offset], lda, &work[itau], &b[
+		    b_offset], ldb, &work[nwork], &i__1, info);
+
+/*           Zero out below R. */
+
+	    if (*n > 1) {
+		i__1 = *n - 1;
+		i__2 = *n - 1;
+		slaset_("L", &i__1, &i__2, &c_b29, &c_b29, &a[a_dim1 + 2],
+			lda);
+	    }
+	}
+
+	ie = 1;
+	itauq = ie + *n;
+	itaup = itauq + *n;
+	nwork = itaup + *n;
+
+/*
+          Bidiagonalize R in A.
+          (Workspace: need 3*N+MM, prefer 3*N+(MM+N)*NB)
+*/
+
+	i__1 = *lwork - nwork + 1;
+	sgebrd_(&mm, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+		work[itaup], &work[nwork], &i__1, info);
+
+/*
+          Multiply B by transpose of left bidiagonalizing vectors of R.
+          (Workspace: need 3*N+NRHS, prefer 3*N+NRHS*NB)
+*/
+
+	i__1 = *lwork - nwork + 1;
+	sormbr_("Q", "L", "T", &mm, nrhs, n, &a[a_offset], lda, &work[itauq],
+		&b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/*        Solve the bidiagonal least squares problem. */
+
+	slalsd_("U", &smlsiz, n, nrhs, &s[1], &work[ie], &b[b_offset], ldb,
+		rcond, rank, &work[nwork], &iwork[1], info);
+	if (*info != 0) {
+	    goto L10;
+	}
+
+/*        Multiply B by right bidiagonalizing vectors of R. */
+
+	i__1 = *lwork - nwork + 1;
+	sormbr_("P", "L", "N", n, nrhs, n, &a[a_offset], lda, &work[itaup], &
+		b[b_offset], ldb, &work[nwork], &i__1, info);
+
+    } else /* if(complicated condition) */ {
+/* Computing MAX */
+	i__1 = *m, i__2 = (*m << 1) - 4, i__1 = max(i__1,i__2), i__1 = max(
+		i__1,*nrhs), i__2 = *n - *m * 3, i__1 = max(i__1,i__2);
+	if (*n >= mnthr && *lwork >= (*m << 2) + *m * *m + max(i__1,wlalsd)) {
+
+/*
+          Path 2a - underdetermined, with many more columns than rows
+          and sufficient workspace for an efficient algorithm.
+*/
+
+	    ldwork = *m;
+/*
+   Computing MAX
+   Computing MAX
+*/
+	    i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), i__3 =
+		    max(i__3,*nrhs), i__4 = *n - *m * 3;
+	    i__1 = (*m << 2) + *m * *lda + max(i__3,i__4), i__2 = *m * *lda +
+		    *m + *m * *nrhs, i__1 = max(i__1,i__2), i__2 = (*m << 2)
+		    + *m * *lda + wlalsd;
+	    if (*lwork >= max(i__1,i__2)) {
+		ldwork = *lda;
+	    }
+	    itau = 1;
+	    nwork = *m + 1;
+
+/*
+          Compute A=L*Q.
+          (Workspace: need 2*M, prefer M+M*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    sgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1,
+		     info);
+	    il = nwork;
+
+/*        Copy L to WORK(IL), zeroing out above its diagonal. */
+
+	    slacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwork);
+	    i__1 = *m - 1;
+	    i__2 = *m - 1;
+	    slaset_("U", &i__1, &i__2, &c_b29, &c_b29, &work[il + ldwork], &
+		    ldwork);
+	    ie = il + ldwork * *m;
+	    itauq = ie + *m;
+	    itaup = itauq + *m;
+	    nwork = itaup + *m;
+
+/*
+          Bidiagonalize L in WORK(IL).
+          (Workspace: need M*M+5*M, prefer M*M+4*M+2*M*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    sgebrd_(m, m, &work[il], &ldwork, &s[1], &work[ie], &work[itauq],
+		    &work[itaup], &work[nwork], &i__1, info);
+
+/*
+          Multiply B by transpose of left bidiagonalizing vectors of L.
+          (Workspace: need M*M+4*M+NRHS, prefer M*M+4*M+NRHS*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    sormbr_("Q", "L", "T", m, nrhs, m, &work[il], &ldwork, &work[
+		    itauq], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/*        Solve the bidiagonal least squares problem. */
+
+	    slalsd_("U", &smlsiz, m, nrhs, &s[1], &work[ie], &b[b_offset],
+		    ldb, rcond, rank, &work[nwork], &iwork[1], info);
+	    if (*info != 0) {
+		goto L10;
+	    }
+
+/*        Multiply B by right bidiagonalizing vectors of L. */
+
+	    i__1 = *lwork - nwork + 1;
+	    sormbr_("P", "L", "N", m, nrhs, m, &work[il], &ldwork, &work[
+		    itaup], &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/*        Zero out below first M rows of B. */
+
+	    i__1 = *n - *m;
+	    slaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[*m + 1 + b_dim1],
+		    ldb);
+	    nwork = itau + *m;
+
+/*
+          Multiply transpose(Q) by B.
+          (Workspace: need M+NRHS, prefer M+NRHS*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    sormlq_("L", "T", n, nrhs, m, &a[a_offset], lda, &work[itau], &b[
+		    b_offset], ldb, &work[nwork], &i__1, info);
+
+	} else {
+
+/*        Path 2 - remaining underdetermined cases. */
+
+	    ie = 1;
+	    itauq = ie + *m;
+	    itaup = itauq + *m;
+	    nwork = itaup + *m;
+
+/*
+          Bidiagonalize A.
+          (Workspace: need 3*M+N, prefer 3*M+(M+N)*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    sgebrd_(m, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], &
+		    work[itaup], &work[nwork], &i__1, info);
+
+/*
+          Multiply B by transpose of left bidiagonalizing vectors.
+          (Workspace: need 3*M+NRHS, prefer 3*M+NRHS*NB)
+*/
+
+	    i__1 = *lwork - nwork + 1;
+	    sormbr_("Q", "L", "T", m, nrhs, n, &a[a_offset], lda, &work[itauq]
+		    , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+/*        Solve the bidiagonal least squares problem. */
+
+	    slalsd_("L", &smlsiz, m, nrhs, &s[1], &work[ie], &b[b_offset],
+		    ldb, rcond, rank, &work[nwork], &iwork[1], info);
+	    if (*info != 0) {
+		goto L10;
+	    }
+
+/*        Multiply B by right bidiagonalizing vectors of A. */
+
+	    i__1 = *lwork - nwork + 1;
+	    sormbr_("P", "L", "N", n, nrhs, m, &a[a_offset], lda, &work[itaup]
+		    , &b[b_offset], ldb, &work[nwork], &i__1, info);
+
+	}
+    }
+
+/*     Undo scaling. */
+
+    if (iascl == 1) {
+	slascl_("G", &c__0, &c__0, &anrm, &smlnum, n, nrhs, &b[b_offset], ldb,
+		 info);
+	slascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], &
+		minmn, info);
+    } else if (iascl == 2) {
+	slascl_("G", &c__0, &c__0, &anrm, &bignum, n, nrhs, &b[b_offset], ldb,
+		 info);
+	slascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], &
+		minmn, info);
+    }
+    if (ibscl == 1) {
+	slascl_("G", &c__0, &c__0, &smlnum, &bnrm, n, nrhs, &b[b_offset], ldb,
+		 info);
+    } else if (ibscl == 2) {
+	slascl_("G", &c__0, &c__0, &bignum, &bnrm, n, nrhs, &b[b_offset], ldb,
+		 info);
+    }
+
+L10:
+    work[1] = (real) maxwrk;
+    iwork[1] = liwork;
+    return 0;
+
+/*     End of SGELSD */
+
+} /* sgelsd_ */
+
+/* Subroutine */ int sgeqr2_(integer *m, integer *n, real *a, integer *lda,
+	real *tau, real *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+
+    /* Local variables */
+    static integer i__, k;
+    static real aii;
+    extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *,
+	    integer *, real *, real *, integer *, real *), xerbla_(
+	    char *, integer *), slarfg_(integer *, real *, real *,
+	    integer *, real *);
+
+
+/*
+    -- LAPACK routine (version 3.2.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       June 2010
+
+
+    Purpose
+    =======
+
+    SGEQR2 computes a QR factorization of a real m by n matrix A:
+    A = Q * R.
+
+    Arguments
+    =========
+
+    M       (input) INTEGER
+            The number of rows of the matrix A.  M >= 0.
 
     N       (input) INTEGER
             The number of columns of the matrix A.  N >= 0.
 
     A       (input/output) REAL array, dimension (LDA,N)
-            On entry, the M-by-N matrix A.
+            On entry, the m by n matrix A.
             On exit, the elements on and above the diagonal of the array
-            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
+            contain the min(m,n) by n upper trapezoidal matrix R (R is
             upper triangular if m >= n); the elements below the diagonal,
             with the array TAU, represent the orthogonal matrix Q as a
-            product of min(m,n) elementary reflectors (see Further
-            Details).
+            product of elementary reflectors (see Further Details).
 
     LDA     (input) INTEGER
             The leading dimension of the array A.  LDA >= max(1,M).
@@ -4216,22 +4782,11 @@ static real c_b2863 = 2.f;
             The scalar factors of the elementary reflectors (see Further
             Details).
 
-    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))
-            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
-
-    LWORK   (input) INTEGER
-            The dimension of the array WORK.  LWORK >= max(1,N).
-            For optimum performance LWORK >= N*NB, where NB is
-            the optimal blocksize.
-
-            If LWORK = -1, then a workspace query is assumed; the routine
-            only calculates the optimal size of the WORK array, returns
-            this value as the first entry of the WORK array, and no error
-            message related to LWORK is issued by XERBLA.
+    WORK    (workspace) REAL array, dimension (N)
 
     INFO    (output) INTEGER
-            = 0:  successful exit
-            < 0:  if INFO = -i, the i-th argument had an illegal value
+            = 0: successful exit
+            < 0: if INFO = -i, the i-th argument had an illegal value
 
     Further Details
     ===============
@@ -4263,68 +4818,218 @@ static real c_b2863 = 2.f;
 
     /* Function Body */
     *info = 0;
-    nb = ilaenv_(&c__1, "SGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
-	    1);
-    lwkopt = *n * nb;
-    work[1] = (real) lwkopt;
-    lquery = *lwork == -1;
     if (*m < 0) {
 	*info = -1;
     } else if (*n < 0) {
 	*info = -2;
     } else if (*lda < max(1,*m)) {
 	*info = -4;
-    } else if (*lwork < max(1,*n) && ! lquery) {
-	*info = -7;
     }
     if (*info != 0) {
 	i__1 = -(*info);
-	xerbla_("SGEQRF", &i__1);
-	return 0;
-    } else if (lquery) {
+	xerbla_("SGEQR2", &i__1);
 	return 0;
     }
 
-/*     Quick return if possible */
-
     k = min(*m,*n);
-    if (k == 0) {
-	work[1] = 1.f;
-	return 0;
-    }
-
-    nbmin = 2;
-    nx = 0;
-    iws = *n;
-    if (nb > 1 && nb < k) {
-
-/*
-          Determine when to cross over from blocked to unblocked code.
 
-   Computing MAX
-*/
-	i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQRF", " ", m, n, &c_n1, &c_n1, (
-		ftnlen)6, (ftnlen)1);
-	nx = max(i__1,i__2);
-	if (nx < k) {
+    i__1 = k;
+    for (i__ = 1; i__ <= i__1; ++i__) {
 
-/*           Determine if workspace is large enough for blocked code. */
+/*        Generate elementary reflector H(i) to annihilate A(i+1:m,i) */
 
-	    ldwork = *n;
-	    iws = ldwork * nb;
-	    if (*lwork < iws) {
+	i__2 = *m - i__ + 1;
+/* Computing MIN */
+	i__3 = i__ + 1;
+	slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1]
+		, &c__1, &tau[i__]);
+	if (i__ < *n) {
 
-/*
-                Not enough workspace to use optimal NB:  reduce NB and
-                determine the minimum value of NB.
-*/
+/*           Apply H(i) to A(i:m,i+1:n) from the left */
 
-		nb = *lwork / ldwork;
-/* Computing MAX */
-		i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQRF", " ", m, n, &c_n1, &
-			c_n1, (ftnlen)6, (ftnlen)1);
-		nbmin = max(i__1,i__2);
-	    }
+	    aii = a[i__ + i__ * a_dim1];
+	    a[i__ + i__ * a_dim1] = 1.f;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n - i__;
+	    slarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &tau[
+		    i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]);
+	    a[i__ + i__ * a_dim1] = aii;
+	}
+/* L10: */
+    }
+    return 0;
+
+/*     End of SGEQR2 */
+
+} /* sgeqr2_ */
+
+/* Subroutine */ int sgeqrf_(integer *m, integer *n, real *a, integer *lda,
+	real *tau, real *work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+
+    /* Local variables */
+    static integer i__, k, ib, nb, nx, iws, nbmin, iinfo;
+    extern /* Subroutine */ int sgeqr2_(integer *, integer *, real *, integer
+	    *, real *, real *, integer *), slarfb_(char *, char *, char *,
+	    char *, integer *, integer *, integer *, real *, integer *, real *
+	    , integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
+	    integer *, integer *, ftnlen, ftnlen);
+    extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *,
+	    real *, integer *, real *, real *, integer *);
+    static integer ldwork, lwkopt;
+    static logical lquery;
+
+
+/*
+    -- LAPACK routine (version 3.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       November 2006
+
+
+    Purpose
+    =======
+
+    SGEQRF computes a QR factorization of a real M-by-N matrix A:
+    A = Q * R.
+
+    Arguments
+    =========
+
+    M       (input) INTEGER
+            The number of rows of the matrix A.  M >= 0.
+
+    N       (input) INTEGER
+            The number of columns of the matrix A.  N >= 0.
+
+    A       (input/output) REAL array, dimension (LDA,N)
+            On entry, the M-by-N matrix A.
+            On exit, the elements on and above the diagonal of the array
+            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
+            upper triangular if m >= n); the elements below the diagonal,
+            with the array TAU, represent the orthogonal matrix Q as a
+            product of min(m,n) elementary reflectors (see Further
+            Details).
+
+    LDA     (input) INTEGER
+            The leading dimension of the array A.  LDA >= max(1,M).
+
+    TAU     (output) REAL array, dimension (min(M,N))
+            The scalar factors of the elementary reflectors (see Further
+            Details).
+
+    WORK    (workspace/output) REAL array, dimension (MAX(1,LWORK))
+            On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+
+    LWORK   (input) INTEGER
+            The dimension of the array WORK.  LWORK >= max(1,N).
+            For optimum performance LWORK >= N*NB, where NB is
+            the optimal blocksize.
+
+            If LWORK = -1, then a workspace query is assumed; the routine
+            only calculates the optimal size of the WORK array, returns
+            this value as the first entry of the WORK array, and no error
+            message related to LWORK is issued by XERBLA.
+
+    INFO    (output) INTEGER
+            = 0:  successful exit
+            < 0:  if INFO = -i, the i-th argument had an illegal value
+
+    Further Details
+    ===============
+
+    The matrix Q is represented as a product of elementary reflectors
+
+       Q = H(1) H(2) . . . H(k), where k = min(m,n).
+
+    Each H(i) has the form
+
+       H(i) = I - tau * v * v'
+
+    where tau is a real scalar, and v is a real vector with
+    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
+    and tau in TAU(i).
+
+    =====================================================================
+
+
+       Test the input arguments
+*/
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1;
+    a -= a_offset;
+    --tau;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+    nb = ilaenv_(&c__1, "SGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)
+	    1);
+    lwkopt = *n * nb;
+    work[1] = (real) lwkopt;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*lda < max(1,*m)) {
+	*info = -4;
+    } else if (*lwork < max(1,*n) && ! lquery) {
+	*info = -7;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGEQRF", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    k = min(*m,*n);
+    if (k == 0) {
+	work[1] = 1.f;
+	return 0;
+    }
+
+    nbmin = 2;
+    nx = 0;
+    iws = *n;
+    if (nb > 1 && nb < k) {
+
+/*
+          Determine when to cross over from blocked to unblocked code.
+
+   Computing MAX
+*/
+	i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQRF", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)6, (ftnlen)1);
+	nx = max(i__1,i__2);
+	if (nx < k) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    ldwork = *n;
+	    iws = ldwork * nb;
+	    if (*lwork < iws) {
+
+/*
+                Not enough workspace to use optimal NB:  reduce NB and
+                determine the minimum value of NB.
+*/
+
+		nb = *lwork / ldwork;
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQRF", " ", m, n, &c_n1, &
+			c_n1, (ftnlen)6, (ftnlen)1);
+		nbmin = max(i__1,i__2);
+	    }
 	}
     }
 
@@ -14248,6 +14953,1389 @@ logical slaisnan_(real *sin1, real *sin2)
 #undef ci
 
 
+/* Subroutine */ int slals0_(integer *icompq, integer *nl, integer *nr,
+	integer *sqre, integer *nrhs, real *b, integer *ldb, real *bx,
+	integer *ldbx, integer *perm, integer *givptr, integer *givcol,
+	integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real *
+	difl, real *difr, real *z__, integer *k, real *c__, real *s, real *
+	work, integer *info)
+{
+    /* System generated locals */
+    integer givcol_dim1, givcol_offset, b_dim1, b_offset, bx_dim1, bx_offset,
+	    difr_dim1, difr_offset, givnum_dim1, givnum_offset, poles_dim1,
+	    poles_offset, i__1, i__2;
+    real r__1;
+
+    /* Local variables */
+    static integer i__, j, m, n;
+    static real dj;
+    static integer nlp1;
+    static real temp;
+    extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+	    integer *, real *, real *);
+    extern doublereal snrm2_(integer *, real *, integer *);
+    static real diflj, difrj, dsigj;
+    extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *),
+	    sgemv_(char *, integer *, integer *, real *, real *, integer *,
+	    real *, integer *, real *, real *, integer *), scopy_(
+	    integer *, real *, integer *, real *, integer *);
+    extern doublereal slamc3_(real *, real *);
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    static real dsigjp;
+    extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *,
+	    real *, integer *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *,
+	    real *, integer *);
+
+
+/*
+    -- LAPACK routine (version 3.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       November 2006
+
+
+    Purpose
+    =======
+
+    SLALS0 applies back the multiplying factors of either the left or the
+    right singular vector matrix of a diagonal matrix appended by a row
+    to the right hand side matrix B in solving the least squares problem
+    using the divide-and-conquer SVD approach.
+
+    For the left singular vector matrix, three types of orthogonal
+    matrices are involved:
+
+    (1L) Givens rotations: the number of such rotations is GIVPTR; the
+         pairs of columns/rows they were applied to are stored in GIVCOL;
+         and the C- and S-values of these rotations are stored in GIVNUM.
+
+    (2L) Permutation. The (NL+1)-st row of B is to be moved to the first
+         row, and for J=2:N, PERM(J)-th row of B is to be moved to the
+         J-th row.
+
+    (3L) The left singular vector matrix of the remaining matrix.
+
+    For the right singular vector matrix, four types of orthogonal
+    matrices are involved:
+
+    (1R) The right singular vector matrix of the remaining matrix.
+
+    (2R) If SQRE = 1, one extra Givens rotation to generate the right
+         null space.
+
+    (3R) The inverse transformation of (2L).
+
+    (4R) The inverse transformation of (1L).
+
+    Arguments
+    =========
+
+    ICOMPQ (input) INTEGER
+           Specifies whether singular vectors are to be computed in
+           factored form:
+           = 0: Left singular vector matrix.
+           = 1: Right singular vector matrix.
+
+    NL     (input) INTEGER
+           The row dimension of the upper block. NL >= 1.
+
+    NR     (input) INTEGER
+           The row dimension of the lower block. NR >= 1.
+
+    SQRE   (input) INTEGER
+           = 0: the lower block is an NR-by-NR square matrix.
+           = 1: the lower block is an NR-by-(NR+1) rectangular matrix.
+
+           The bidiagonal matrix has row dimension N = NL + NR + 1,
+           and column dimension M = N + SQRE.
+
+    NRHS   (input) INTEGER
+           The number of columns of B and BX. NRHS must be at least 1.
+
+    B      (input/output) REAL array, dimension ( LDB, NRHS )
+           On input, B contains the right hand sides of the least
+           squares problem in rows 1 through M. On output, B contains
+           the solution X in rows 1 through N.
+
+    LDB    (input) INTEGER
+           The leading dimension of B. LDB must be at least
+           max(1,MAX( M, N ) ).
+
+    BX     (workspace) REAL array, dimension ( LDBX, NRHS )
+
+    LDBX   (input) INTEGER
+           The leading dimension of BX.
+
+    PERM   (input) INTEGER array, dimension ( N )
+           The permutations (from deflation and sorting) applied
+           to the two blocks.
+
+    GIVPTR (input) INTEGER
+           The number of Givens rotations which took place in this
+           subproblem.
+
+    GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 )
+           Each pair of numbers indicates a pair of rows/columns
+           involved in a Givens rotation.
+
+    LDGCOL (input) INTEGER
+           The leading dimension of GIVCOL, must be at least N.
+
+    GIVNUM (input) REAL array, dimension ( LDGNUM, 2 )
+           Each number indicates the C or S value used in the
+           corresponding Givens rotation.
+
+    LDGNUM (input) INTEGER
+           The leading dimension of arrays DIFR, POLES and
+           GIVNUM, must be at least K.
+
+    POLES  (input) REAL array, dimension ( LDGNUM, 2 )
+           On entry, POLES(1:K, 1) contains the new singular
+           values obtained from solving the secular equation, and
+           POLES(1:K, 2) is an array containing the poles in the secular
+           equation.
+
+    DIFL   (input) REAL array, dimension ( K ).
+           On entry, DIFL(I) is the distance between I-th updated
+           (undeflated) singular value and the I-th (undeflated) old
+           singular value.
+
+    DIFR   (input) REAL array, dimension ( LDGNUM, 2 ).
+           On entry, DIFR(I, 1) contains the distances between I-th
+           updated (undeflated) singular value and the I+1-th
+           (undeflated) old singular value. And DIFR(I, 2) is the
+           normalizing factor for the I-th right singular vector.
+
+    Z      (input) REAL array, dimension ( K )
+           Contain the components of the deflation-adjusted updating row
+           vector.
+
+    K      (input) INTEGER
+           Contains the dimension of the non-deflated matrix,
+           This is the order of the related secular equation. 1 <= K <=N.
+
+    C      (input) REAL
+           C contains garbage if SQRE =0 and the C-value of a Givens
+           rotation related to the right null space if SQRE = 1.
+
+    S      (input) REAL
+           S contains garbage if SQRE =0 and the S-value of a Givens
+           rotation related to the right null space if SQRE = 1.
+
+    WORK   (workspace) REAL array, dimension ( K )
+
+    INFO   (output) INTEGER
+            = 0:  successful exit.
+            < 0:  if INFO = -i, the i-th argument had an illegal value.
+
+    Further Details
+    ===============
+
+    Based on contributions by
+       Ming Gu and Ren-Cang Li, Computer Science Division, University of
+         California at Berkeley, USA
+       Osni Marques, LBNL/NERSC, USA
+
+    =====================================================================
+
+
+       Test the input parameters.
+*/
+
+    /* Parameter adjustments */
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    bx_dim1 = *ldbx;
+    bx_offset = 1 + bx_dim1;
+    bx -= bx_offset;
+    --perm;
+    givcol_dim1 = *ldgcol;
+    givcol_offset = 1 + givcol_dim1;
+    givcol -= givcol_offset;
+    difr_dim1 = *ldgnum;
+    difr_offset = 1 + difr_dim1;
+    difr -= difr_offset;
+    poles_dim1 = *ldgnum;
+    poles_offset = 1 + poles_dim1;
+    poles -= poles_offset;
+    givnum_dim1 = *ldgnum;
+    givnum_offset = 1 + givnum_dim1;
+    givnum -= givnum_offset;
+    --difl;
+    --z__;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+
+    if (*icompq < 0 || *icompq > 1) {
+	*info = -1;
+    } else if (*nl < 1) {
+	*info = -2;
+    } else if (*nr < 1) {
+	*info = -3;
+    } else if (*sqre < 0 || *sqre > 1) {
+	*info = -4;
+    }
+
+    n = *nl + *nr + 1;
+
+    if (*nrhs < 1) {
+	*info = -5;
+    } else if (*ldb < n) {
+	*info = -7;
+    } else if (*ldbx < n) {
+	*info = -9;
+    } else if (*givptr < 0) {
+	*info = -11;
+    } else if (*ldgcol < n) {
+	*info = -13;
+    } else if (*ldgnum < n) {
+	*info = -15;
+    } else if (*k < 1) {
+	*info = -20;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SLALS0", &i__1);
+	return 0;
+    }
+
+    m = n + *sqre;
+    nlp1 = *nl + 1;
+
+    if (*icompq == 0) {
+
+/*
+          Apply back orthogonal transformations from the left.
+
+          Step (1L): apply back the Givens rotations performed.
+*/
+
+	i__1 = *givptr;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    srot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+		    b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+		    (givnum_dim1 << 1)], &givnum[i__ + givnum_dim1]);
+/* L10: */
+	}
+
+/*        Step (2L): permute rows of B. */
+
+	scopy_(nrhs, &b[nlp1 + b_dim1], ldb, &bx[bx_dim1 + 1], ldbx);
+	i__1 = n;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    scopy_(nrhs, &b[perm[i__] + b_dim1], ldb, &bx[i__ + bx_dim1],
+		    ldbx);
+/* L20: */
+	}
+
+/*
+          Step (3L): apply the inverse of the left singular vector
+          matrix to BX.
+*/
+
+	if (*k == 1) {
+	    scopy_(nrhs, &bx[bx_offset], ldbx, &b[b_offset], ldb);
+	    if (z__[1] < 0.f) {
+		sscal_(nrhs, &c_b151, &b[b_offset], ldb);
+	    }
+	} else {
+	    i__1 = *k;
+	    for (j = 1; j <= i__1; ++j) {
+		diflj = difl[j];
+		dj = poles[j + poles_dim1];
+		dsigj = -poles[j + (poles_dim1 << 1)];
+		if (j < *k) {
+		    difrj = -difr[j + difr_dim1];
+		    dsigjp = -poles[j + 1 + (poles_dim1 << 1)];
+		}
+		if (z__[j] == 0.f || poles[j + (poles_dim1 << 1)] == 0.f) {
+		    work[j] = 0.f;
+		} else {
+		    work[j] = -poles[j + (poles_dim1 << 1)] * z__[j] / diflj /
+			     (poles[j + (poles_dim1 << 1)] + dj);
+		}
+		i__2 = j - 1;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] ==
+			    0.f) {
+			work[i__] = 0.f;
+		    } else {
+			work[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+				/ (slamc3_(&poles[i__ + (poles_dim1 << 1)], &
+				dsigj) - diflj) / (poles[i__ + (poles_dim1 <<
+				1)] + dj);
+		    }
+/* L30: */
+		}
+		i__2 = *k;
+		for (i__ = j + 1; i__ <= i__2; ++i__) {
+		    if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] ==
+			    0.f) {
+			work[i__] = 0.f;
+		    } else {
+			work[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__]
+				/ (slamc3_(&poles[i__ + (poles_dim1 << 1)], &
+				dsigjp) + difrj) / (poles[i__ + (poles_dim1 <<
+				 1)] + dj);
+		    }
+/* L40: */
+		}
+		work[1] = -1.f;
+		temp = snrm2_(k, &work[1], &c__1);
+		sgemv_("T", k, nrhs, &c_b15, &bx[bx_offset], ldbx, &work[1], &
+			c__1, &c_b29, &b[j + b_dim1], ldb);
+		slascl_("G", &c__0, &c__0, &temp, &c_b15, &c__1, nrhs, &b[j +
+			b_dim1], ldb, info);
+/* L50: */
+	    }
+	}
+
+/*        Move the deflated rows of BX to B also. */
+
+	if (*k < max(m,n)) {
+	    i__1 = n - *k;
+	    slacpy_("A", &i__1, nrhs, &bx[*k + 1 + bx_dim1], ldbx, &b[*k + 1
+		    + b_dim1], ldb);
+	}
+    } else {
+
+/*
+          Apply back the right orthogonal transformations.
+
+          Step (1R): apply back the new right singular vector matrix
+          to B.
+*/
+
+	if (*k == 1) {
+	    scopy_(nrhs, &b[b_offset], ldb, &bx[bx_offset], ldbx);
+	} else {
+	    i__1 = *k;
+	    for (j = 1; j <= i__1; ++j) {
+		dsigj = poles[j + (poles_dim1 << 1)];
+		if (z__[j] == 0.f) {
+		    work[j] = 0.f;
+		} else {
+		    work[j] = -z__[j] / difl[j] / (dsigj + poles[j +
+			    poles_dim1]) / difr[j + (difr_dim1 << 1)];
+		}
+		i__2 = j - 1;
+		for (i__ = 1; i__ <= i__2; ++i__) {
+		    if (z__[j] == 0.f) {
+			work[i__] = 0.f;
+		    } else {
+			r__1 = -poles[i__ + 1 + (poles_dim1 << 1)];
+			work[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difr[
+				i__ + difr_dim1]) / (dsigj + poles[i__ +
+				poles_dim1]) / difr[i__ + (difr_dim1 << 1)];
+		    }
+/* L60: */
+		}
+		i__2 = *k;
+		for (i__ = j + 1; i__ <= i__2; ++i__) {
+		    if (z__[j] == 0.f) {
+			work[i__] = 0.f;
+		    } else {
+			r__1 = -poles[i__ + (poles_dim1 << 1)];
+			work[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difl[
+				i__]) / (dsigj + poles[i__ + poles_dim1]) /
+				difr[i__ + (difr_dim1 << 1)];
+		    }
+/* L70: */
+		}
+		sgemv_("T", k, nrhs, &c_b15, &b[b_offset], ldb, &work[1], &
+			c__1, &c_b29, &bx[j + bx_dim1], ldbx);
+/* L80: */
+	    }
+	}
+
+/*
+          Step (2R): if SQRE = 1, apply back the rotation that is
+          related to the right null space of the subproblem.
+*/
+
+	if (*sqre == 1) {
+	    scopy_(nrhs, &b[m + b_dim1], ldb, &bx[m + bx_dim1], ldbx);
+	    srot_(nrhs, &bx[bx_dim1 + 1], ldbx, &bx[m + bx_dim1], ldbx, c__,
+		    s);
+	}
+	if (*k < max(m,n)) {
+	    i__1 = n - *k;
+	    slacpy_("A", &i__1, nrhs, &b[*k + 1 + b_dim1], ldb, &bx[*k + 1 +
+		    bx_dim1], ldbx);
+	}
+
+/*        Step (3R): permute rows of B. */
+
+	scopy_(nrhs, &bx[bx_dim1 + 1], ldbx, &b[nlp1 + b_dim1], ldb);
+	if (*sqre == 1) {
+	    scopy_(nrhs, &bx[m + bx_dim1], ldbx, &b[m + b_dim1], ldb);
+	}
+	i__1 = n;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    scopy_(nrhs, &bx[i__ + bx_dim1], ldbx, &b[perm[i__] + b_dim1],
+		    ldb);
+/* L90: */
+	}
+
+/*        Step (4R): apply back the Givens rotations performed. */
+
+	for (i__ = *givptr; i__ >= 1; --i__) {
+	    r__1 = -givnum[i__ + givnum_dim1];
+	    srot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, &
+		    b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ +
+		    (givnum_dim1 << 1)], &r__1);
+/* L100: */
+	}
+    }
+
+    return 0;
+
+/*     End of SLALS0 */
+
+} /* slals0_ */
+
+/* Subroutine */ int slalsa_(integer *icompq, integer *smlsiz, integer *n,
+	integer *nrhs, real *b, integer *ldb, real *bx, integer *ldbx, real *
+	u, integer *ldu, real *vt, integer *k, real *difl, real *difr, real *
+	z__, real *poles, integer *givptr, integer *givcol, integer *ldgcol,
+	integer *perm, real *givnum, real *c__, real *s, real *work, integer *
+	iwork, integer *info)
+{
+    /* System generated locals */
+    integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, b_dim1,
+	    b_offset, bx_dim1, bx_offset, difl_dim1, difl_offset, difr_dim1,
+	    difr_offset, givnum_dim1, givnum_offset, poles_dim1, poles_offset,
+	     u_dim1, u_offset, vt_dim1, vt_offset, z_dim1, z_offset, i__1,
+	    i__2;
+
+    /* Local variables */
+    static integer i__, j, i1, ic, lf, nd, ll, nl, nr, im1, nlf, nrf, lvl,
+	    ndb1, nlp1, lvl2, nrp1, nlvl, sqre, inode, ndiml;
+    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *,
+	    integer *, real *, real *, integer *, real *, integer *, real *,
+	    real *, integer *);
+    static integer ndimr;
+    extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+	    integer *), slals0_(integer *, integer *, integer *, integer *,
+	    integer *, real *, integer *, real *, integer *, integer *,
+	    integer *, integer *, integer *, real *, integer *, real *, real *
+	    , real *, real *, integer *, real *, real *, real *, integer *),
+	    xerbla_(char *, integer *), slasdt_(integer *, integer *,
+	    integer *, integer *, integer *, integer *, integer *);
+
+
+/*
+    -- LAPACK routine (version 3.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       November 2006
+
+
+    Purpose
+    =======
+
+    SLALSA is an itermediate step in solving the least squares problem
+    by computing the SVD of the coefficient matrix in compact form (The
+    singular vectors are computed as products of simple orthorgonal
+    matrices.).
+
+    If ICOMPQ = 0, SLALSA applies the inverse of the left singular vector
+    matrix of an upper bidiagonal matrix to the right hand side; and if
+    ICOMPQ = 1, SLALSA applies the right singular vector matrix to the
+    right hand side. The singular vector matrices were generated in
+    compact form by SLALSA.
+
+    Arguments
+    =========
+
+
+    ICOMPQ (input) INTEGER
+           Specifies whether the left or the right singular vector
+           matrix is involved.
+           = 0: Left singular vector matrix
+           = 1: Right singular vector matrix
+
+    SMLSIZ (input) INTEGER
+           The maximum size of the subproblems at the bottom of the
+           computation tree.
+
+    N      (input) INTEGER
+           The row and column dimensions of the upper bidiagonal matrix.
+
+    NRHS   (input) INTEGER
+           The number of columns of B and BX. NRHS must be at least 1.
+
+    B      (input/output) REAL array, dimension ( LDB, NRHS )
+           On input, B contains the right hand sides of the least
+           squares problem in rows 1 through M.
+           On output, B contains the solution X in rows 1 through N.
+
+    LDB    (input) INTEGER
+           The leading dimension of B in the calling subprogram.
+           LDB must be at least max(1,MAX( M, N ) ).
+
+    BX     (output) REAL array, dimension ( LDBX, NRHS )
+           On exit, the result of applying the left or right singular
+           vector matrix to B.
+
+    LDBX   (input) INTEGER
+           The leading dimension of BX.
+
+    U      (input) REAL array, dimension ( LDU, SMLSIZ ).
+           On entry, U contains the left singular vector matrices of all
+           subproblems at the bottom level.
+
+    LDU    (input) INTEGER, LDU = > N.
+           The leading dimension of arrays U, VT, DIFL, DIFR,
+           POLES, GIVNUM, and Z.
+
+    VT     (input) REAL array, dimension ( LDU, SMLSIZ+1 ).
+           On entry, VT' contains the right singular vector matrices of
+           all subproblems at the bottom level.
+
+    K      (input) INTEGER array, dimension ( N ).
+
+    DIFL   (input) REAL array, dimension ( LDU, NLVL ).
+           where NLVL = INT(log_2 (N/(SMLSIZ+1))) + 1.
+
+    DIFR   (input) REAL array, dimension ( LDU, 2 * NLVL ).
+           On entry, DIFL(*, I) and DIFR(*, 2 * I -1) record
+           distances between singular values on the I-th level and
+           singular values on the (I -1)-th level, and DIFR(*, 2 * I)
+           record the normalizing factors of the right singular vectors
+           matrices of subproblems on I-th level.
+
+    Z      (input) REAL array, dimension ( LDU, NLVL ).
+           On entry, Z(1, I) contains the components of the deflation-
+           adjusted updating row vector for subproblems on the I-th
+           level.
+
+    POLES  (input) REAL array, dimension ( LDU, 2 * NLVL ).
+           On entry, POLES(*, 2 * I -1: 2 * I) contains the new and old
+           singular values involved in the secular equations on the I-th
+           level.
+
+    GIVPTR (input) INTEGER array, dimension ( N ).
+           On entry, GIVPTR( I ) records the number of Givens
+           rotations performed on the I-th problem on the computation
+           tree.
+
+    GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 * NLVL ).
+           On entry, for each I, GIVCOL(*, 2 * I - 1: 2 * I) records the
+           locations of Givens rotations performed on the I-th level on
+           the computation tree.
+
+    LDGCOL (input) INTEGER, LDGCOL = > N.
+           The leading dimension of arrays GIVCOL and PERM.
+
+    PERM   (input) INTEGER array, dimension ( LDGCOL, NLVL ).
+           On entry, PERM(*, I) records permutations done on the I-th
+           level of the computation tree.
+
+    GIVNUM (input) REAL array, dimension ( LDU, 2 * NLVL ).
+           On entry, GIVNUM(*, 2 *I -1 : 2 * I) records the C- and S-
+           values of Givens rotations performed on the I-th level on the
+           computation tree.
+
+    C      (input) REAL array, dimension ( N ).
+           On entry, if the I-th subproblem is not square,
+           C( I ) contains the C-value of a Givens rotation related to
+           the right null space of the I-th subproblem.
+
+    S      (input) REAL array, dimension ( N ).
+           On entry, if the I-th subproblem is not square,
+           S( I ) contains the S-value of a Givens rotation related to
+           the right null space of the I-th subproblem.
+
+    WORK   (workspace) REAL array.
+           The dimension must be at least N.
+
+    IWORK  (workspace) INTEGER array.
+           The dimension must be at least 3 * N
+
+    INFO   (output) INTEGER
+            = 0:  successful exit.
+            < 0:  if INFO = -i, the i-th argument had an illegal value.
+
+    Further Details
+    ===============
+
+    Based on contributions by
+       Ming Gu and Ren-Cang Li, Computer Science Division, University of
+         California at Berkeley, USA
+       Osni Marques, LBNL/NERSC, USA
+
+    =====================================================================
+
+
+       Test the input parameters.
+*/
+
+    /* Parameter adjustments */
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    bx_dim1 = *ldbx;
+    bx_offset = 1 + bx_dim1;
+    bx -= bx_offset;
+    givnum_dim1 = *ldu;
+    givnum_offset = 1 + givnum_dim1;
+    givnum -= givnum_offset;
+    poles_dim1 = *ldu;
+    poles_offset = 1 + poles_dim1;
+    poles -= poles_offset;
+    z_dim1 = *ldu;
+    z_offset = 1 + z_dim1;
+    z__ -= z_offset;
+    difr_dim1 = *ldu;
+    difr_offset = 1 + difr_dim1;
+    difr -= difr_offset;
+    difl_dim1 = *ldu;
+    difl_offset = 1 + difl_dim1;
+    difl -= difl_offset;
+    vt_dim1 = *ldu;
+    vt_offset = 1 + vt_dim1;
+    vt -= vt_offset;
+    u_dim1 = *ldu;
+    u_offset = 1 + u_dim1;
+    u -= u_offset;
+    --k;
+    --givptr;
+    perm_dim1 = *ldgcol;
+    perm_offset = 1 + perm_dim1;
+    perm -= perm_offset;
+    givcol_dim1 = *ldgcol;
+    givcol_offset = 1 + givcol_dim1;
+    givcol -= givcol_offset;
+    --c__;
+    --s;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+    if (*icompq < 0 || *icompq > 1) {
+	*info = -1;
+    } else if (*smlsiz < 3) {
+	*info = -2;
+    } else if (*n < *smlsiz) {
+	*info = -3;
+    } else if (*nrhs < 1) {
+	*info = -4;
+    } else if (*ldb < *n) {
+	*info = -6;
+    } else if (*ldbx < *n) {
+	*info = -8;
+    } else if (*ldu < *n) {
+	*info = -10;
+    } else if (*ldgcol < *n) {
+	*info = -19;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SLALSA", &i__1);
+	return 0;
+    }
+
+/*     Book-keeping and  setting up the computation tree. */
+
+    inode = 1;
+    ndiml = inode + *n;
+    ndimr = ndiml + *n;
+
+    slasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr],
+	    smlsiz);
+
+/*
+       The following code applies back the left singular vector factors.
+       For applying back the right singular vector factors, go to 50.
+*/
+
+    if (*icompq == 1) {
+	goto L50;
+    }
+
+/*
+       The nodes on the bottom level of the tree were solved
+       by SLASDQ. The corresponding left and right singular vector
+       matrices are in explicit form. First apply back the left
+       singular vector matrices.
+*/
+
+    ndb1 = (nd + 1) / 2;
+    i__1 = nd;
+    for (i__ = ndb1; i__ <= i__1; ++i__) {
+
+/*
+          IC : center row of each node
+          NL : number of rows of left  subproblem
+          NR : number of rows of right subproblem
+          NLF: starting row of the left   subproblem
+          NRF: starting row of the right  subproblem
+*/
+
+	i1 = i__ - 1;
+	ic = iwork[inode + i1];
+	nl = iwork[ndiml + i1];
+	nr = iwork[ndimr + i1];
+	nlf = ic - nl;
+	nrf = ic + 1;
+	sgemm_("T", "N", &nl, nrhs, &nl, &c_b15, &u[nlf + u_dim1], ldu, &b[
+		nlf + b_dim1], ldb, &c_b29, &bx[nlf + bx_dim1], ldbx);
+	sgemm_("T", "N", &nr, nrhs, &nr, &c_b15, &u[nrf + u_dim1], ldu, &b[
+		nrf + b_dim1], ldb, &c_b29, &bx[nrf + bx_dim1], ldbx);
+/* L10: */
+    }
+
+/*
+       Next copy the rows of B that correspond to unchanged rows
+       in the bidiagonal matrix to BX.
+*/
+
+    i__1 = nd;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	ic = iwork[inode + i__ - 1];
+	scopy_(nrhs, &b[ic + b_dim1], ldb, &bx[ic + bx_dim1], ldbx);
+/* L20: */
+    }
+
+/*
+       Finally go through the left singular vector matrices of all
+       the other subproblems bottom-up on the tree.
+*/
+
+    j = pow_ii(&c__2, &nlvl);
+    sqre = 0;
+
+    for (lvl = nlvl; lvl >= 1; --lvl) {
+	lvl2 = (lvl << 1) - 1;
+
+/*
+          find the first node LF and last node LL on
+          the current level LVL
+*/
+
+	if (lvl == 1) {
+	    lf = 1;
+	    ll = 1;
+	} else {
+	    i__1 = lvl - 1;
+	    lf = pow_ii(&c__2, &i__1);
+	    ll = (lf << 1) - 1;
+	}
+	i__1 = ll;
+	for (i__ = lf; i__ <= i__1; ++i__) {
+	    im1 = i__ - 1;
+	    ic = iwork[inode + im1];
+	    nl = iwork[ndiml + im1];
+	    nr = iwork[ndimr + im1];
+	    nlf = ic - nl;
+	    nrf = ic + 1;
+	    --j;
+	    slals0_(icompq, &nl, &nr, &sqre, nrhs, &bx[nlf + bx_dim1], ldbx, &
+		    b[nlf + b_dim1], ldb, &perm[nlf + lvl * perm_dim1], &
+		    givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+		    givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+		     poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+		    lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+		    j], &s[j], &work[1], info);
+/* L30: */
+	}
+/* L40: */
+    }
+    goto L90;
+
+/*     ICOMPQ = 1: applying back the right singular vector factors. */
+
+L50:
+
+/*
+       First now go through the right singular vector matrices of all
+       the tree nodes top-down.
+*/
+
+    j = 0;
+    i__1 = nlvl;
+    for (lvl = 1; lvl <= i__1; ++lvl) {
+	lvl2 = (lvl << 1) - 1;
+
+/*
+          Find the first node LF and last node LL on
+          the current level LVL.
+*/
+
+	if (lvl == 1) {
+	    lf = 1;
+	    ll = 1;
+	} else {
+	    i__2 = lvl - 1;
+	    lf = pow_ii(&c__2, &i__2);
+	    ll = (lf << 1) - 1;
+	}
+	i__2 = lf;
+	for (i__ = ll; i__ >= i__2; --i__) {
+	    im1 = i__ - 1;
+	    ic = iwork[inode + im1];
+	    nl = iwork[ndiml + im1];
+	    nr = iwork[ndimr + im1];
+	    nlf = ic - nl;
+	    nrf = ic + 1;
+	    if (i__ == ll) {
+		sqre = 0;
+	    } else {
+		sqre = 1;
+	    }
+	    ++j;
+	    slals0_(icompq, &nl, &nr, &sqre, nrhs, &b[nlf + b_dim1], ldb, &bx[
+		    nlf + bx_dim1], ldbx, &perm[nlf + lvl * perm_dim1], &
+		    givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, &
+		    givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 *
+		     poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf +
+		    lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[
+		    j], &s[j], &work[1], info);
+/* L60: */
+	}
+/* L70: */
+    }
+
+/*
+       The nodes on the bottom level of the tree were solved
+       by SLASDQ. The corresponding right singular vector
+       matrices are in explicit form. Apply them back.
+*/
+
+    ndb1 = (nd + 1) / 2;
+    i__1 = nd;
+    for (i__ = ndb1; i__ <= i__1; ++i__) {
+	i1 = i__ - 1;
+	ic = iwork[inode + i1];
+	nl = iwork[ndiml + i1];
+	nr = iwork[ndimr + i1];
+	nlp1 = nl + 1;
+	if (i__ == nd) {
+	    nrp1 = nr;
+	} else {
+	    nrp1 = nr + 1;
+	}
+	nlf = ic - nl;
+	nrf = ic + 1;
+	sgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b15, &vt[nlf + vt_dim1], ldu,
+		&b[nlf + b_dim1], ldb, &c_b29, &bx[nlf + bx_dim1], ldbx);
+	sgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b15, &vt[nrf + vt_dim1], ldu,
+		&b[nrf + b_dim1], ldb, &c_b29, &bx[nrf + bx_dim1], ldbx);
+/* L80: */
+    }
+
+L90:
+
+    return 0;
+
+/*     End of SLALSA */
+
+} /* slalsa_ */
+
+/* Subroutine */ int slalsd_(char *uplo, integer *smlsiz, integer *n, integer
+	*nrhs, real *d__, real *e, real *b, integer *ldb, real *rcond,
+	integer *rank, real *work, integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer b_dim1, b_offset, i__1, i__2;
+    real r__1;
+
+    /* Local variables */
+    static integer c__, i__, j, k;
+    static real r__;
+    static integer s, u, z__;
+    static real cs;
+    static integer bx;
+    static real sn;
+    static integer st, vt, nm1, st1;
+    static real eps;
+    static integer iwk;
+    static real tol;
+    static integer difl, difr;
+    static real rcnd;
+    static integer perm, nsub, nlvl, sqre, bxst;
+    extern /* Subroutine */ int srot_(integer *, real *, integer *, real *,
+	    integer *, real *, real *), sgemm_(char *, char *, integer *,
+	    integer *, integer *, real *, real *, integer *, real *, integer *
+	    , real *, real *, integer *);
+    static integer poles, sizei, nsize;
+    extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *,
+	    integer *);
+    static integer nwork, icmpq1, icmpq2;
+    extern doublereal slamch_(char *);
+    extern /* Subroutine */ int slasda_(integer *, integer *, integer *,
+	    integer *, real *, real *, real *, integer *, real *, integer *,
+	    real *, real *, real *, real *, integer *, integer *, integer *,
+	    integer *, real *, real *, real *, real *, integer *, integer *),
+	    xerbla_(char *, integer *), slalsa_(integer *, integer *,
+	    integer *, integer *, real *, integer *, real *, integer *, real *
+	    , integer *, real *, integer *, real *, real *, real *, real *,
+	    integer *, integer *, integer *, integer *, real *, real *, real *
+	    , real *, integer *, integer *), slascl_(char *, integer *,
+	    integer *, real *, real *, integer *, integer *, real *, integer *
+	    , integer *);
+    static integer givcol;
+    extern integer isamax_(integer *, real *, integer *);
+    extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer
+	    *, integer *, integer *, real *, real *, real *, integer *, real *
+	    , integer *, real *, integer *, real *, integer *),
+	    slacpy_(char *, integer *, integer *, real *, integer *, real *,
+	    integer *), slartg_(real *, real *, real *, real *, real *
+	    ), slaset_(char *, integer *, integer *, real *, real *, real *,
+	    integer *);
+    static real orgnrm;
+    static integer givnum;
+    extern doublereal slanst_(char *, integer *, real *, real *);
+    extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *);
+    static integer givptr, smlszp;
+
+
+/*
+    -- LAPACK routine (version 3.2.2) --
+    -- LAPACK is a software package provided by Univ. of Tennessee,    --
+    -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+       June 2010
+
+
+    Purpose
+    =======
+
+    SLALSD uses the singular value decomposition of A to solve the least
+    squares problem of finding X to minimize the Euclidean norm of each
+    column of A*X-B, where A is N-by-N upper bidiagonal, and X and B
+    are N-by-NRHS. The solution X overwrites B.
+
+    The singular values of A smaller than RCOND times the largest
+    singular value are treated as zero in solving the least squares
+    problem; in this case a minimum norm solution is returned.
+    The actual singular values are returned in D in ascending order.
+
+    This code makes very mild assumptions about floating point
+    arithmetic. It will work on machines with a guard digit in
+    add/subtract, or on those binary machines without guard digits
+    which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
+    It could conceivably fail on hexadecimal or decimal machines
+    without guard digits, but we know of none.
+
+    Arguments
+    =========
+
+    UPLO   (input) CHARACTER*1
+           = 'U': D and E define an upper bidiagonal matrix.
+           = 'L': D and E define a  lower bidiagonal matrix.
+
+    SMLSIZ (input) INTEGER
+           The maximum size of the subproblems at the bottom of the
+           computation tree.
+
+    N      (input) INTEGER
+           The dimension of the  bidiagonal matrix.  N >= 0.
+
+    NRHS   (input) INTEGER
+           The number of columns of B. NRHS must be at least 1.
+
+    D      (input/output) REAL array, dimension (N)
+           On entry D contains the main diagonal of the bidiagonal
+           matrix. On exit, if INFO = 0, D contains its singular values.
+
+    E      (input/output) REAL array, dimension (N-1)
+           Contains the super-diagonal entries of the bidiagonal matrix.
+           On exit, E has been destroyed.
+
+    B      (input/output) REAL array, dimension (LDB,NRHS)
+           On input, B contains the right hand sides of the least
+           squares problem. On output, B contains the solution X.
+
+    LDB    (input) INTEGER
+           The leading dimension of B in the calling subprogram.
+           LDB must be at least max(1,N).
+
+    RCOND  (input) REAL
+           The singular values of A less than or equal to RCOND times
+           the largest singular value are treated as zero in solving
+           the least squares problem. If RCOND is negative,
+           machine precision is used instead.
+           For example, if diag(S)*X=B were the least squares problem,
+           where diag(S) is a diagonal matrix of singular values, the
+           solution would be X(i) = B(i) / S(i) if S(i) is greater than
+           RCOND*max(S), and X(i) = 0 if S(i) is less than or equal to
+           RCOND*max(S).
+
+    RANK   (output) INTEGER
+           The number of singular values of A greater than RCOND times
+           the largest singular value.
+
+    WORK   (workspace) REAL array, dimension at least
+           (9*N + 2*N*SMLSIZ + 8*N*NLVL + N*NRHS + (SMLSIZ+1)**2),
+           where NLVL = max(0, INT(log_2 (N/(SMLSIZ+1))) + 1).
+
+    IWORK  (workspace) INTEGER array, dimension at least
+           (3*N*NLVL + 11*N)
+
+    INFO   (output) INTEGER
+           = 0:  successful exit.
+           < 0:  if INFO = -i, the i-th argument had an illegal value.
+           > 0:  The algorithm failed to compute a singular value while
+                 working on the submatrix lying in rows and columns
+                 INFO/(N+1) through MOD(INFO,N+1).
+
+    Further Details
+    ===============
+
+    Based on contributions by
+       Ming Gu and Ren-Cang Li, Computer Science Division, University of
+         California at Berkeley, USA
+       Osni Marques, LBNL/NERSC, USA
+
+    =====================================================================
+
+
+       Test the input parameters.
+*/
+
+    /* Parameter adjustments */
+    --d__;
+    --e;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1;
+    b -= b_offset;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+    if (*n < 0) {
+	*info = -3;
+    } else if (*nrhs < 1) {
+	*info = -4;
+    } else if (*ldb < 1 || *ldb < *n) {
+	*info = -8;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SLALSD", &i__1);
+	return 0;
+    }
+
+    eps = slamch_("Epsilon");
+
+/*     Set up the tolerance. */
+
+    if (*rcond <= 0.f || *rcond >= 1.f) {
+	rcnd = eps;
+    } else {
+	rcnd = *rcond;
+    }
+
+    *rank = 0;
+
+/*     Quick return if possible. */
+
+    if (*n == 0) {
+	return 0;
+    } else if (*n == 1) {
+	if (d__[1] == 0.f) {
+	    slaset_("A", &c__1, nrhs, &c_b29, &c_b29, &b[b_offset], ldb);
+	} else {
+	    *rank = 1;
+	    slascl_("G", &c__0, &c__0, &d__[1], &c_b15, &c__1, nrhs, &b[
+		    b_offset], ldb, info);
+	    d__[1] = dabs(d__[1]);
+	}
+	return 0;
+    }
+
+/*     Rotate the matrix if it is lower bidiagonal. */
+
+    if (*(unsigned char *)uplo == 'L') {
+	i__1 = *n - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    slartg_(&d__[i__], &e[i__], &cs, &sn, &r__);
+	    d__[i__] = r__;
+	    e[i__] = sn * d__[i__ + 1];
+	    d__[i__ + 1] = cs * d__[i__ + 1];
+	    if (*nrhs == 1) {
+		srot_(&c__1, &b[i__ + b_dim1], &c__1, &b[i__ + 1 + b_dim1], &
+			c__1, &cs, &sn);
+	    } else {
+		work[(i__ << 1) - 1] = cs;
+		work[i__ * 2] = sn;
+	    }
+/* L10: */
+	}
+	if (*nrhs > 1) {
+	    i__1 = *nrhs;
+	    for (i__ = 1; i__ <= i__1; ++i__) {
+		i__2 = *n - 1;
+		for (j = 1; j <= i__2; ++j) {
+		    cs = work[(j << 1) - 1];
+		    sn = work[j * 2];
+		    srot_(&c__1, &b[j + i__ * b_dim1], &c__1, &b[j + 1 + i__ *
+			     b_dim1], &c__1, &cs, &sn);
+/* L20: */
+		}
+/* L30: */
+	    }
+	}
+    }
+
+/*     Scale. */
+
+    nm1 = *n - 1;
+    orgnrm = slanst_("M", n, &d__[1], &e[1]);
+    if (orgnrm == 0.f) {
+	slaset_("A", n, nrhs, &c_b29, &c_b29, &b[b_offset], ldb);
+	return 0;
+    }
+
+    slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, &c__1, &d__[1], n, info);
+    slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &nm1, &c__1, &e[1], &nm1,
+	    info);
+
+/*
+       If N is smaller than the minimum divide size SMLSIZ, then solve
+       the problem with another solver.
+*/
+
+    if (*n <= *smlsiz) {
+	nwork = *n * *n + 1;
+	slaset_("A", n, n, &c_b29, &c_b15, &work[1], n);
+	slasdq_("U", &c__0, n, n, &c__0, nrhs, &d__[1], &e[1], &work[1], n, &
+		work[1], n, &b[b_offset], ldb, &work[nwork], info);
+	if (*info != 0) {
+	    return 0;
+	}
+	tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1));
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (d__[i__] <= tol) {
+		slaset_("A", &c__1, nrhs, &c_b29, &c_b29, &b[i__ + b_dim1],
+			ldb);
+	    } else {
+		slascl_("G", &c__0, &c__0, &d__[i__], &c_b15, &c__1, nrhs, &b[
+			i__ + b_dim1], ldb, info);
+		++(*rank);
+	    }
+/* L40: */
+	}
+	sgemm_("T", "N", n, nrhs, n, &c_b15, &work[1], n, &b[b_offset], ldb, &
+		c_b29, &work[nwork], n);
+	slacpy_("A", n, nrhs, &work[nwork], n, &b[b_offset], ldb);
+
+/*        Unscale. */
+
+	slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n,
+		info);
+	slasrt_("D", n, &d__[1], info);
+	slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, nrhs, &b[b_offset],
+		ldb, info);
+
+	return 0;
+    }
+
+/*     Book-keeping and setting up some constants. */
+
+    nlvl = (integer) (log((real) (*n) / (real) (*smlsiz + 1)) / log(2.f)) + 1;
+
+    smlszp = *smlsiz + 1;
+
+    u = 1;
+    vt = *smlsiz * *n + 1;
+    difl = vt + smlszp * *n;
+    difr = difl + nlvl * *n;
+    z__ = difr + (nlvl * *n << 1);
+    c__ = z__ + nlvl * *n;
+    s = c__ + *n;
+    poles = s + *n;
+    givnum = poles + (nlvl << 1) * *n;
+    bx = givnum + (nlvl << 1) * *n;
+    nwork = bx + *n * *nrhs;
+
+    sizei = *n + 1;
+    k = sizei + *n;
+    givptr = k + *n;
+    perm = givptr + *n;
+    givcol = perm + nlvl * *n;
+    iwk = givcol + (nlvl * *n << 1);
+
+    st = 1;
+    sqre = 0;
+    icmpq1 = 1;
+    icmpq2 = 0;
+    nsub = 0;
+
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if ((r__1 = d__[i__], dabs(r__1)) < eps) {
+	    d__[i__] = r_sign(&eps, &d__[i__]);
+	}
+/* L50: */
+    }
+
+    i__1 = nm1;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if ((r__1 = e[i__], dabs(r__1)) < eps || i__ == nm1) {
+	    ++nsub;
+	    iwork[nsub] = st;
+
+/*
+             Subproblem found. First determine its size and then
+             apply divide and conquer on it.
+*/
+
+	    if (i__ < nm1) {
+
+/*              A subproblem with E(I) small for I < NM1. */
+
+		nsize = i__ - st + 1;
+		iwork[sizei + nsub - 1] = nsize;
+	    } else if ((r__1 = e[i__], dabs(r__1)) >= eps) {
+
+/*              A subproblem with E(NM1) not too small but I = NM1. */
+
+		nsize = *n - st + 1;
+		iwork[sizei + nsub - 1] = nsize;
+	    } else {
+
+/*
+                A subproblem with E(NM1) small. This implies an
+                1-by-1 subproblem at D(N), which is not solved
+                explicitly.
+*/
+
+		nsize = i__ - st + 1;
+		iwork[sizei + nsub - 1] = nsize;
+		++nsub;
+		iwork[nsub] = *n;
+		iwork[sizei + nsub - 1] = 1;
+		scopy_(nrhs, &b[*n + b_dim1], ldb, &work[bx + nm1], n);
+	    }
+	    st1 = st - 1;
+	    if (nsize == 1) {
+
+/*
+                This is a 1-by-1 subproblem and is not solved
+                explicitly.
+*/
+
+		scopy_(nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n);
+	    } else if (nsize <= *smlsiz) {
+
+/*              This is a small subproblem and is solved by SLASDQ. */
+
+		slaset_("A", &nsize, &nsize, &c_b29, &c_b15, &work[vt + st1],
+			n);
+		slasdq_("U", &c__0, &nsize, &nsize, &c__0, nrhs, &d__[st], &e[
+			st], &work[vt + st1], n, &work[nwork], n, &b[st +
+			b_dim1], ldb, &work[nwork], info);
+		if (*info != 0) {
+		    return 0;
+		}
+		slacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx +
+			st1], n);
+	    } else {
+
+/*              A large problem. Solve it using divide and conquer. */
+
+		slasda_(&icmpq1, smlsiz, &nsize, &sqre, &d__[st], &e[st], &
+			work[u + st1], n, &work[vt + st1], &iwork[k + st1], &
+			work[difl + st1], &work[difr + st1], &work[z__ + st1],
+			 &work[poles + st1], &iwork[givptr + st1], &iwork[
+			givcol + st1], n, &iwork[perm + st1], &work[givnum +
+			st1], &work[c__ + st1], &work[s + st1], &work[nwork],
+			&iwork[iwk], info);
+		if (*info != 0) {
+		    return 0;
+		}
+		bxst = bx + st1;
+		slalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, &
+			work[bxst], n, &work[u + st1], n, &work[vt + st1], &
+			iwork[k + st1], &work[difl + st1], &work[difr + st1],
+			&work[z__ + st1], &work[poles + st1], &iwork[givptr +
+			st1], &iwork[givcol + st1], n, &iwork[perm + st1], &
+			work[givnum + st1], &work[c__ + st1], &work[s + st1],
+			&work[nwork], &iwork[iwk], info);
+		if (*info != 0) {
+		    return 0;
+		}
+	    }
+	    st = i__ + 1;
+	}
+/* L60: */
+    }
+
+/*     Apply the singular values and treat the tiny ones as zero. */
+
+    tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1));
+
+    i__1 = *n;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+
+/*
+          Some of the elements in D can be negative because 1-by-1
+          subproblems were not solved explicitly.
+*/
+
+	if ((r__1 = d__[i__], dabs(r__1)) <= tol) {
+	    slaset_("A", &c__1, nrhs, &c_b29, &c_b29, &work[bx + i__ - 1], n);
+	} else {
+	    ++(*rank);
+	    slascl_("G", &c__0, &c__0, &d__[i__], &c_b15, &c__1, nrhs, &work[
+		    bx + i__ - 1], n, info);
+	}
+	d__[i__] = (r__1 = d__[i__], dabs(r__1));
+/* L70: */
+    }
+
+/*     Now apply back the right singular vectors. */
+
+    icmpq2 = 1;
+    i__1 = nsub;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	st = iwork[i__];
+	st1 = st - 1;
+	nsize = iwork[sizei + i__ - 1];
+	bxst = bx + st1;
+	if (nsize == 1) {
+	    scopy_(nrhs, &work[bxst], n, &b[st + b_dim1], ldb);
+	} else if (nsize <= *smlsiz) {
+	    sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b15, &work[vt + st1], n,
+		     &work[bxst], n, &c_b29, &b[st + b_dim1], ldb);
+	} else {
+	    slalsa_(&icmpq2, smlsiz, &nsize, nrhs, &work[bxst], n, &b[st +
+		    b_dim1], ldb, &work[u + st1], n, &work[vt + st1], &iwork[
+		    k + st1], &work[difl + st1], &work[difr + st1], &work[z__
+		    + st1], &work[poles + st1], &iwork[givptr + st1], &iwork[
+		    givcol + st1], n, &iwork[perm + st1], &work[givnum + st1],
+		     &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[
+		    iwk], info);
+	    if (*info != 0) {
+		return 0;
+	    }
+	}
+/* L80: */
+    }
+
+/*     Unscale and sort the singular values. */
+
+    slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n, info);
+    slasrt_("D", n, &d__[1], info);
+    slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, nrhs, &b[b_offset], ldb,
+	    info);
+
+    return 0;
+
+/*     End of SLALSD */
+
+} /* slalsd_ */
+
 /* Subroutine */ int slamrg_(integer *n1, integer *n2, real *a, integer *
 	strd1, integer *strd2, integer *index)
 {
@@ -29679,7 +31767,7 @@ doublereal slapy3_(real *x, real *y, real *z__)
 /*              Note that M is very tiny */
 
 		if (l == 0.f) {
-		    t = r_sign(&c_b2863, &ft) * r_sign(&c_b15, &gt);
+		    t = r_sign(&c_b3178, &ft) * r_sign(&c_b15, &gt);
 		} else {
 		    t = gt / r_sign(&d__, &ft) + m / t;
 		}
diff --git a/numpy/linalg/lapack_lite/wrapped_routines b/numpy/linalg/lapack_lite/wrapped_routines
index e949890979d2..0d99c724d23f 100644
--- a/numpy/linalg/lapack_lite/wrapped_routines
+++ b/numpy/linalg/lapack_lite/wrapped_routines
@@ -1,5 +1,6 @@
 ccopy
 cgeev
+cgelsd
 cgemm
 cgesdd
 cgesv
@@ -23,6 +24,7 @@ dpotrs
 dsyevd
 scopy
 sgeev
+sgelsd
 sgemm
 sgesdd
 sgesv
diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py
index de25d25e9a39..f073abadf505 100644
--- a/numpy/linalg/linalg.py
+++ b/numpy/linalg/linalg.py
@@ -97,6 +97,9 @@ def _raise_linalgerror_eigenvalues_nonconvergence(err, flag):
 def _raise_linalgerror_svd_nonconvergence(err, flag):
     raise LinAlgError("SVD did not converge")
 
+def _raise_linalgerror_lstsq(err, flag):
+    raise LinAlgError("SVD did not converge in Linear Least Squares")
+
 def get_linalg_error_extobj(callback):
     extobj = list(_linalg_error_extobj)  # make a copy
     extobj[2] = callback
@@ -1997,7 +2000,6 @@ def lstsq(a, b, rcond="warn"):
     >>> plt.show()
 
     """
-    import math
     a, _ = _makearray(a)
     b, wrap = _makearray(b)
     is_1d = b.ndim == 1
@@ -2008,7 +2010,6 @@ def lstsq(a, b, rcond="warn"):
     m  = a.shape[0]
     n  = a.shape[1]
     n_rhs = b.shape[1]
-    ldb = max(n, m)
     if m != b.shape[0]:
         raise LinAlgError('Incompatible dimensions')
 
@@ -2028,62 +2029,25 @@ def lstsq(a, b, rcond="warn"):
                       FutureWarning, stacklevel=2)
         rcond = -1
     if rcond is None:
-        rcond = finfo(t).eps * ldb
-
-    bstar = zeros((ldb, n_rhs), t)
-    bstar[:m, :n_rhs] = b
-    a, bstar = _fastCopyAndTranspose(t, a, bstar)
-    a, bstar = _to_native_byte_order(a, bstar)
-    s = zeros((min(m, n),), real_t)
-    # This line:
-    #  * is incorrect, according to the LAPACK documentation
-    #  * raises a ValueError if min(m,n) == 0
-    #  * should not be calculated here anyway, as LAPACK should calculate
-    #    `liwork` for us. But that only works if our version of lapack does
-    #    not have this bug:
-    #      http://icl.cs.utk.edu/lapack-forum/archives/lapack/msg00899.html
-    #    Lapack_lite does have that bug...
-    nlvl = max( 0, int( math.log( float(min(m, n))/2. ) ) + 1 )
-    iwork = zeros((3*min(m, n)*nlvl+11*min(m, n),), fortran_int)
-    if isComplexType(t):
-        lapack_routine = lapack_lite.zgelsd
-        lwork = 1
-        rwork = zeros((lwork,), real_t)
-        work = zeros((lwork,), t)
-        results = lapack_routine(m, n, n_rhs, a, m, bstar, ldb, s, rcond,
-                                 0, work, -1, rwork, iwork, 0)
-        lrwork = int(rwork[0])
-        lwork = int(work[0].real)
-        work = zeros((lwork,), t)
-        rwork = zeros((lrwork,), real_t)
-        results = lapack_routine(m, n, n_rhs, a, m, bstar, ldb, s, rcond,
-                                 0, work, lwork, rwork, iwork, 0)
+        rcond = finfo(t).eps * max(n, m)
+
+    if m <= n:
+        gufunc = _umath_linalg.lstsq_m
     else:
-        lapack_routine = lapack_lite.dgelsd
-        lwork = 1
-        work = zeros((lwork,), t)
-        results = lapack_routine(m, n, n_rhs, a, m, bstar, ldb, s, rcond,
-                                 0, work, -1, iwork, 0)
-        lwork = int(work[0])
-        work = zeros((lwork,), t)
-        results = lapack_routine(m, n, n_rhs, a, m, bstar, ldb, s, rcond,
-                                 0, work, lwork, iwork, 0)
-    if results['info'] > 0:
-        raise LinAlgError('SVD did not converge in Linear Least Squares')
-
-    # undo transpose imposed by fortran-order arrays
-    b_out = bstar.T
+        gufunc = _umath_linalg.lstsq_n
+
+    signature = 'DDd->Did' if isComplexType(t) else 'ddd->did'
+    extobj = get_linalg_error_extobj(_raise_linalgerror_lstsq)
+    b_out, rank, s = gufunc(a, b, rcond, signature=signature, extobj=extobj)
 
     # b_out contains both the solution and the components of the residuals
-    x = b_out[:n,:]
-    r_parts = b_out[n:,:]
+    x = b_out[...,:n,:]
+    r_parts = b_out[...,n:,:]
     if isComplexType(t):
         resids = sum(abs(r_parts)**2, axis=-2)
     else:
         resids = sum(r_parts**2, axis=-2)
 
-    rank = results['rank']
-
     # remove the axis we added
     if is_1d:
         x = x.squeeze(axis=-1)
diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src
index 0248518ac7cb..d8cfdf6acd43 100644
--- a/numpy/linalg/umath_linalg.c.src
+++ b/numpy/linalg/umath_linalg.c.src
@@ -129,12 +129,26 @@ FNAME(zheevd)(char *jobz, char *uplo, int *n,
               int *info);
 
 extern int
+FNAME(sgelsd)(int *m, int *n, int *nrhs,
+              float a[], int *lda, float b[], int *ldb,
+              float s[], float *rcond, int *rank,
+              float work[], int *lwork, int iwork[],
+              int *info);
+extern int
 FNAME(dgelsd)(int *m, int *n, int *nrhs,
               double a[], int *lda, double b[], int *ldb,
               double s[], double *rcond, int *rank,
               double work[], int *lwork, int iwork[],
               int *info);
 extern int
+FNAME(cgelsd)(int *m, int *n, int *nrhs,
+              f2c_complex a[], int *lda,
+              f2c_complex b[], int *ldb,
+              float s[], float *rcond, int *rank,
+              f2c_complex work[], int *lwork,
+              float rwork[], int iwork[],
+              int *info);
+extern int
 FNAME(zgelsd)(int *m, int *n, int *nrhs,
               f2c_doublecomplex a[], int *lda,
               f2c_doublecomplex b[], int *ldb,
@@ -492,6 +506,7 @@ static void init_constants(void)
  * columns: number of columns in the matrix
  * row_strides: the number bytes between consecutive rows.
  * column_strides: the number of bytes between consecutive columns.
+ * output_lead_dim: BLAS/LAPACK-side leading dimension, in elements
  */
 typedef struct linearize_data_struct
 {
@@ -499,8 +514,24 @@ typedef struct linearize_data_struct
   npy_intp columns;
   npy_intp row_strides;
   npy_intp column_strides;
+  npy_intp output_lead_dim;
 } LINEARIZE_DATA_t;
 
+static NPY_INLINE void
+init_linearize_data_ex(LINEARIZE_DATA_t *lin_data,
+                       npy_intp rows,
+                       npy_intp columns,
+                       npy_intp row_strides,
+                       npy_intp column_strides,
+                       npy_intp output_lead_dim)
+{
+    lin_data->rows = rows;
+    lin_data->columns = columns;
+    lin_data->row_strides = row_strides;
+    lin_data->column_strides = column_strides;
+    lin_data->output_lead_dim = output_lead_dim;
+}
+
 static NPY_INLINE void
 init_linearize_data(LINEARIZE_DATA_t *lin_data,
                     npy_intp rows,
@@ -508,10 +539,8 @@ init_linearize_data(LINEARIZE_DATA_t *lin_data,
                     npy_intp row_strides,
                     npy_intp column_strides)
 {
-    lin_data->rows = rows;
-    lin_data->columns = columns;
-    lin_data->row_strides = row_strides;
-    lin_data->column_strides = column_strides;
+    init_linearize_data_ex(
+        lin_data, rows, columns, row_strides, column_strides, columns);
 }
 
 static NPY_INLINE void
@@ -846,7 +875,7 @@ linearize_@TYPE@_matrix(void *dst_in,
                 }
             }
             src += data->row_strides/sizeof(@typ@);
-            dst += data->columns;
+            dst += data->output_lead_dim;
         }
         return rv;
     } else {
@@ -893,7 +922,7 @@ delinearize_@TYPE@_matrix(void *dst_in,
                            sizeof(@typ@));
                 }
             }
-            src += data->columns;
+            src += data->output_lead_dim;
             dst += data->row_strides/sizeof(@typ@);
         }
 
@@ -2871,6 +2900,359 @@ static void
 
 /**end repeat**/
 
+
+/* -------------------------------------------------------------------------- */
+                 /* least squares */
+
+typedef struct gelsd_params_struct
+{
+    fortran_int M;
+    fortran_int N;
+    fortran_int NRHS;
+    void *A;
+    fortran_int LDA;
+    void *B;
+    fortran_int LDB;
+    void *S;
+    void *RCOND;
+    fortran_int RANK;
+    void *WORK;
+    fortran_int LWORK;
+    void *RWORK;
+    void *IWORK;
+} GELSD_PARAMS_t;
+
+
+static inline void
+dump_gelsd_params(const char *name,
+                  GELSD_PARAMS_t *params)
+{
+    TRACE_TXT("\n%s:\n"\
+
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+              "%14s: %18p\n"\
+
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+              "%14s: %18d\n"\
+
+              "%14s: %18p\n",
+
+              name,
+
+              "A", params->A,
+              "B", params->B,
+              "S", params->S,
+              "WORK", params->WORK,
+              "RWORK", params->RWORK,
+              "IWORK", params->IWORK,
+
+              "M", (int)params->M,
+              "N", (int)params->N,
+              "NRHS", (int)params->NRHS,
+              "LDA", (int)params->LDA,
+              "LDB", (int)params->LDB,
+              "LWORK", (int)params->LWORK,
+              "RANK", (int)params->RANK,
+
+              "RCOND", params->RCOND);
+}
+
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE#
+   #lapack_func=sgelsd,dgelsd#
+   #ftyp=fortran_real,fortran_doublereal#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GELSD_PARAMS_t *params)
+{
+    fortran_int rv;
+    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->B, &params->LDB,
+                          params->S,
+                          params->RCOND, &params->RANK,
+                          params->WORK, &params->LWORK,
+                          params->IWORK,
+                          &rv);
+    return rv;
+}
+
+static inline int
+init_@lapack_func@(GELSD_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n,
+                   fortran_int nrhs)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *b, *s, *work, *iwork;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    fortran_int max_m_n = fortran_int_max(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_max_m_n = max_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+    size_t safe_nrhs = nrhs;
+
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
+    size_t s_size = safe_min_m_n * sizeof(@ftyp@);
+
+    fortran_int work_count;
+    size_t work_size;
+    size_t iwork_size;
+    fortran_int lda = fortran_int_max(1, m);
+    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
+
+    mem_buff = malloc(a_size + b_size + s_size);
+
+    if (!mem_buff)
+        goto error;
+
+    a = mem_buff;
+    b = a + a_size;
+    s = b + b_size;
+
+
+    params->M = m;
+    params->N = n;
+    params->NRHS = nrhs;
+    params->A = a;
+    params->B = b;
+    params->S = s;
+    params->LDA = lda;
+    params->LDB = ldb;
+
+    {
+        /* compute optimal work size */
+        @ftyp@ work_size_query;
+        fortran_int iwork_size_query;
+
+        params->WORK = &work_size_query;
+        params->IWORK = &iwork_size_query;
+        params->RWORK = NULL;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query;
+
+        work_size  = (size_t) work_size_query * sizeof(@ftyp@);
+        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
+    }
+
+    mem_buff2 = malloc(work_size + iwork_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+    iwork = work + work_size;
+
+    params->WORK = work;
+    params->RWORK = NULL;
+    params->IWORK = iwork;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+/**begin repeat
+   #TYPE=CFLOAT,CDOUBLE#
+   #ftyp=fortran_complex,fortran_doublecomplex#
+   #frealtyp=fortran_real,fortran_doublereal#
+   #typ=COMPLEX_t,DOUBLECOMPLEX_t#
+   #lapack_func=cgelsd,zgelsd#
+ */
+
+static inline fortran_int
+call_@lapack_func@(GELSD_PARAMS_t *params)
+{
+    fortran_int rv;
+    LAPACK(@lapack_func@)(&params->M, &params->N, &params->NRHS,
+                          params->A, &params->LDA,
+                          params->B, &params->LDB,
+                          params->S,
+                          params->RCOND, &params->RANK,
+                          params->WORK, &params->LWORK,
+                          params->RWORK, params->IWORK,
+                          &rv);
+    return rv;
+}
+
+static inline int
+init_@lapack_func@(GELSD_PARAMS_t *params,
+                   fortran_int m,
+                   fortran_int n,
+                   fortran_int nrhs)
+{
+    npy_uint8 *mem_buff = NULL;
+    npy_uint8 *mem_buff2 = NULL;
+    npy_uint8 *a, *b, *s, *work, *iwork, *rwork;
+    fortran_int min_m_n = fortran_int_min(m, n);
+    fortran_int max_m_n = fortran_int_max(m, n);
+    size_t safe_min_m_n = min_m_n;
+    size_t safe_max_m_n = max_m_n;
+    size_t safe_m = m;
+    size_t safe_n = n;
+    size_t safe_nrhs = nrhs;
+
+    size_t a_size = safe_m * safe_n * sizeof(@ftyp@);
+    size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@);
+    size_t s_size = safe_min_m_n * sizeof(@frealtyp@);
+
+    fortran_int work_count;
+    size_t work_size, rwork_size, iwork_size;
+    fortran_int lda = fortran_int_max(1, m);
+    fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n));
+
+    mem_buff = malloc(a_size + b_size + s_size);
+
+    if (!mem_buff)
+        goto error;
+
+    a = mem_buff;
+    b = a + a_size;
+    s = b + b_size;
+
+
+    params->M = m;
+    params->N = n;
+    params->NRHS = nrhs;
+    params->A = a;
+    params->B = b;
+    params->S = s;
+    params->LDA = lda;
+    params->LDB = ldb;
+
+    {
+        /* compute optimal work size */
+        @ftyp@ work_size_query;
+        @frealtyp@ rwork_size_query;
+        fortran_int iwork_size_query;
+
+        params->WORK = &work_size_query;
+        params->IWORK = &iwork_size_query;
+        params->RWORK = &rwork_size_query;
+        params->LWORK = -1;
+
+        if (call_@lapack_func@(params) != 0)
+            goto error;
+
+        work_count = (fortran_int)work_size_query.r;
+
+        work_size  = (size_t )work_size_query.r * sizeof(@ftyp@);
+        rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@);
+        iwork_size = (size_t)iwork_size_query * sizeof(fortran_int);
+    }
+
+    mem_buff2 = malloc(work_size + rwork_size + iwork_size);
+    if (!mem_buff2)
+        goto error;
+
+    work = mem_buff2;
+    rwork = work + work_size;
+    iwork = rwork + rwork_size;
+
+    params->WORK = work;
+    params->RWORK = rwork;
+    params->IWORK = iwork;
+    params->LWORK = work_count;
+
+    return 1;
+ error:
+    TRACE_TXT("%s failed init\n", __FUNCTION__);
+    free(mem_buff);
+    free(mem_buff2);
+    memset(params, 0, sizeof(*params));
+
+    return 0;
+}
+
+/**end repeat**/
+
+
+/**begin repeat
+   #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE#
+   #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE#
+   #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd#
+ */
+static inline void
+release_@lapack_func@(GELSD_PARAMS_t* params)
+{
+    /* A and WORK contain allocated blocks */
+    free(params->A);
+    free(params->WORK);
+    memset(params, 0, sizeof(*params));
+}
+
+static void
+@TYPE@_lstsq(char **args, npy_intp *dimensions, npy_intp *steps,
+             void *NPY_UNUSED(func))
+{
+    GELSD_PARAMS_t params;
+    int error_occurred = get_fp_invalid_and_clear();
+    fortran_int n, m, nrhs;
+    INIT_OUTER_LOOP_6
+
+    m = (fortran_int)dimensions[0];
+    n = (fortran_int)dimensions[1];
+    nrhs = (fortran_int)dimensions[2];
+
+    if (init_@lapack_func@(&params, m, n, nrhs)) {
+        LINEARIZE_DATA_t a_in, b_in, x_out, s_out;
+
+        init_linearize_data(&a_in, n, m, steps[1], steps[0]);
+        init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m));
+        init_linearize_data(&x_out, nrhs, fortran_int_max(n, m), steps[5], steps[4]);
+        init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[6]);
+
+        BEGIN_OUTER_LOOP_6
+            int not_ok;
+            linearize_@TYPE@_matrix(params.A, args[0], &a_in);
+            linearize_@TYPE@_matrix(params.B, args[1], &b_in);
+            params.RCOND = args[2];
+            not_ok = call_@lapack_func@(&params);
+            if (!not_ok) {
+                delinearize_@TYPE@_matrix(args[3], params.B, &x_out);
+                *(npy_int*) args[4] = params.RANK;
+                delinearize_@REALTYPE@_matrix(args[5], params.S, &s_out);
+            } else {
+                error_occurred = 1;
+                nan_@TYPE@_matrix(args[3], &x_out);
+                *(npy_int*) args[4] = -1;
+                nan_@REALTYPE@_matrix(args[5], &s_out);
+            }
+        END_OUTER_LOOP
+
+        release_@lapack_func@(&params);
+    }
+
+    set_fp_invalid_or_clear(error_occurred);
+}
+
+/**end repeat**/
+
 #pragma GCC diagnostic pop
 
 /* -------------------------------------------------------------------------- */
@@ -2941,6 +3323,7 @@ GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S);
 GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A);
+GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq);
 GUFUNC_FUNC_ARRAY_EIG(eig);
 GUFUNC_FUNC_ARRAY_EIG(eigvals);
 
@@ -3006,6 +3389,14 @@ static char svd_1_3_types[] = {
     NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE
 };
 
+/*  A,           b,           rcond,      x,           rank,    s        */
+static char lstsq_types[] = {
+    NPY_FLOAT,   NPY_FLOAT,   NPY_FLOAT,  NPY_FLOAT,   NPY_INT, NPY_FLOAT,
+    NPY_DOUBLE,  NPY_DOUBLE,  NPY_DOUBLE, NPY_DOUBLE,  NPY_INT, NPY_DOUBLE,
+    NPY_CFLOAT,  NPY_CFLOAT,  NPY_FLOAT,  NPY_CFLOAT,  NPY_INT, NPY_FLOAT,
+    NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_INT, NPY_DOUBLE
+};
+
 typedef struct gufunc_descriptor_struct {
     char *name;
     char *signature;
@@ -3192,12 +3583,29 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = {
         "eigvals",
         "(m,m)->(m)",
         "eigvals on the last two dimension and broadcast to the rest. \n"\
-        "Results in a vector of eigenvalues. \n"\
-        "    \"(m,m)->(m),(m,m)\" \n",
+        "Results in a vector of eigenvalues. \n",
         3, 1, 1,
         FUNC_ARRAY_NAME(eigvals),
         eigvals_types
     },
+    {
+        "lstsq_m",
+        "(m,n),(m,nrhs),()->(n,nrhs),(),(m)",
+        "least squares on the last two dimensions and broadcast to the rest. \n"\
+        "For m <= n. \n",
+        4, 3, 3,
+        FUNC_ARRAY_NAME(lstsq),
+        lstsq_types
+    },
+    {
+        "lstsq_n",
+        "(m,n),(m,nrhs),()->(m,nrhs),(),(n)",
+        "least squares on the last two dimensions and broadcast to the rest. \n"\
+        "For m >= n. \n",
+        4, 3, 3,
+        FUNC_ARRAY_NAME(lstsq),
+        lstsq_types
+    }
 };
 
 static void