diff --git a/numpy/linalg/lapack_lite/f2c_c_lapack.c b/numpy/linalg/lapack_lite/f2c_c_lapack.c index 25221ba55fdb..f52e1e1572c9 100644 --- a/numpy/linalg/lapack_lite/f2c_c_lapack.c +++ b/numpy/linalg/lapack_lite/f2c_c_lapack.c @@ -37,19 +37,20 @@ static integer c__3 = 3; static integer c__2 = 2; static integer c__0 = 0; static integer c__65 = 65; -static real c_b894 = 1.f; +static integer c__9 = 9; +static integer c__6 = 6; +static real c_b328 = 0.f; +static real c_b1034 = 1.f; static integer c__12 = 12; static integer c__49 = 49; -static real c_b1087 = 0.f; -static integer c__9 = 9; -static real c_b1136 = -1.f; +static real c_b1276 = -1.f; static integer c__13 = 13; static integer c__15 = 15; static integer c__14 = 14; static integer c__16 = 16; static logical c_false = FALSE_; static logical c_true = TRUE_; -static real c_b2023 = .5f; +static real c_b2435 = .5f; /* Subroutine */ int cgebak_(char *job, char *side, integer *n, integer *ilo, integer *ihi, real *scale, integer *m, complex *v, integer *ldv, @@ -2684,176 +2685,764 @@ static real c_b2023 = .5f; } /* cgelqf_ */ -/* Subroutine */ int cgeqr2_(integer *m, integer *n, complex *a, integer *lda, - complex *tau, complex *work, integer *info) +/* Subroutine */ int cgelsd_(integer *m, integer *n, integer *nrhs, complex * + a, integer *lda, complex *b, integer *ldb, real *s, real *rcond, + integer *rank, complex *work, integer *lwork, real *rwork, integer * + iwork, integer *info) { /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3; - complex q__1; + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4; /* Local variables */ - static integer i__, k; - static complex alpha; - extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * - , integer *, complex *, complex *, integer *, complex *), - clarfg_(integer *, complex *, complex *, integer *, complex *), - xerbla_(char *, integer *); + static integer ie, il, mm; + static real eps, anrm, bnrm; + static integer itau, nlvl, iascl, ibscl; + static real sfmin; + static integer minmn, maxmn, itaup, itauq, mnthr, nwork; + extern /* Subroutine */ int cgebrd_(integer *, integer *, complex *, + integer *, real *, real *, complex *, complex *, complex *, + integer *, integer *), slabad_(real *, real *); + extern doublereal clange_(char *, integer *, integer *, complex *, + integer *, real *); + extern /* Subroutine */ int cgelqf_(integer *, integer *, complex *, + integer *, complex *, complex *, integer *, integer *), clalsd_( + char *, integer *, integer *, integer *, real *, real *, complex * + , integer *, real *, integer *, complex *, real *, integer *, + integer *), clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *), cgeqrf_(integer *, integer *, complex *, integer *, + complex *, complex *, integer *, integer *); + extern doublereal slamch_(char *); + extern /* Subroutine */ int clacpy_(char *, integer *, integer *, complex + *, integer *, complex *, integer *), claset_(char *, + integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + static real bignum; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *), cunmbr_(char *, char *, char *, integer *, integer *, + integer *, complex *, integer *, complex *, complex *, integer *, + complex *, integer *, integer *), slaset_( + char *, integer *, integer *, real *, real *, real *, integer *), cunmlq_(char *, char *, integer *, integer *, integer *, + complex *, integer *, complex *, complex *, integer *, complex *, + integer *, integer *); + static integer ldwork; + extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, + integer *, complex *, integer *, complex *, complex *, integer *, + complex *, integer *, integer *); + static integer liwork, minwrk, maxwrk; + static real smlnum; + static integer lrwork; + static logical lquery; + static integer nrwork, smlsiz; /* - -- LAPACK routine (version 3.2.2) -- + -- LAPACK driver routine (version 3.2) -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - June 2010 + November 2006 Purpose ======= - CGEQR2 computes a QR factorization of a complex m by n matrix A: - A = Q * R. + CGELSD computes the minimum-norm solution to a real linear least + squares problem: + minimize 2-norm(| b - A*x |) + using the singular value decomposition (SVD) of A. A is an M-by-N + matrix which may be rank-deficient. + + Several right hand side vectors b and solution vectors x can be + handled in a single call; they are stored as the columns of the + M-by-NRHS right hand side matrix B and the N-by-NRHS solution + matrix X. + + The problem is solved in three steps: + (1) Reduce the coefficient matrix A to bidiagonal form with + Householder tranformations, reducing the original problem + into a "bidiagonal least squares problem" (BLS) + (2) Solve the BLS using a divide and conquer approach. + (3) Apply back all the Householder tranformations to solve + the original least squares problem. + + The effective rank of A is determined by treating as zero those + singular values which are less than RCOND times the largest singular + value. + + The divide and conquer algorithm makes very mild assumptions about + floating point arithmetic. It will work on machines with a guard + digit in add/subtract, or on those binary machines without guard + digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or + Cray-2. It could conceivably fail on hexadecimal or decimal machines + without guard digits, but we know of none. Arguments ========= M (input) INTEGER - The number of rows of the matrix A. M >= 0. + The number of rows of the matrix A. M >= 0. N (input) INTEGER - The number of columns of the matrix A. N >= 0. + The number of columns of the matrix A. N >= 0. + + NRHS (input) INTEGER + The number of right hand sides, i.e., the number of columns + of the matrices B and X. NRHS >= 0. A (input/output) COMPLEX array, dimension (LDA,N) - On entry, the m by n matrix A. - On exit, the elements on and above the diagonal of the array - contain the min(m,n) by n upper trapezoidal matrix R (R is - upper triangular if m >= n); the elements below the diagonal, - with the array TAU, represent the unitary matrix Q as a - product of elementary reflectors (see Further Details). + On entry, the M-by-N matrix A. + On exit, A has been destroyed. LDA (input) INTEGER - The leading dimension of the array A. LDA >= max(1,M). + The leading dimension of the array A. LDA >= max(1,M). - TAU (output) COMPLEX array, dimension (min(M,N)) - The scalar factors of the elementary reflectors (see Further - Details). + B (input/output) COMPLEX array, dimension (LDB,NRHS) + On entry, the M-by-NRHS right hand side matrix B. + On exit, B is overwritten by the N-by-NRHS solution matrix X. + If m >= n and RANK = n, the residual sum-of-squares for + the solution in the i-th column is given by the sum of + squares of the modulus of elements n+1:m in that column. - WORK (workspace) COMPLEX array, dimension (N) + LDB (input) INTEGER + The leading dimension of the array B. LDB >= max(1,M,N). - INFO (output) INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value + S (output) REAL array, dimension (min(M,N)) + The singular values of A in decreasing order. + The condition number of A in the 2-norm = S(1)/S(min(m,n)). - Further Details - =============== + RCOND (input) REAL + RCOND is used to determine the effective rank of A. + Singular values S(i) <= RCOND*S(1) are treated as zero. + If RCOND < 0, machine precision is used instead. - The matrix Q is represented as a product of elementary reflectors + RANK (output) INTEGER + The effective rank of A, i.e., the number of singular values + which are greater than RCOND*S(1). - Q = H(1) H(2) . . . H(k), where k = min(m,n). + WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) + On exit, if INFO = 0, WORK(1) returns the optimal LWORK. - Each H(i) has the form + LWORK (input) INTEGER + The dimension of the array WORK. LWORK must be at least 1. + The exact minimum amount of workspace needed depends on M, + N and NRHS. As long as LWORK is at least + 2 * N + N * NRHS + if M is greater than or equal to N or + 2 * M + M * NRHS + if M is less than N, the code will execute correctly. + For good performance, LWORK should generally be larger. - H(i) = I - tau * v * v' + If LWORK = -1, then a workspace query is assumed; the routine + only calculates the optimal size of the array WORK and the + minimum sizes of the arrays RWORK and IWORK, and returns + these values as the first entries of the WORK, RWORK and + IWORK arrays, and no error message related to LWORK is issued + by XERBLA. - where tau is a complex scalar, and v is a complex vector with - v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), - and tau in TAU(i). + RWORK (workspace) REAL array, dimension (MAX(1,LRWORK)) + LRWORK >= + 10*N + 2*N*SMLSIZ + 8*N*NLVL + 3*SMLSIZ*NRHS + + MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS ) + if M is greater than or equal to N or + 10*M + 2*M*SMLSIZ + 8*M*NLVL + 3*SMLSIZ*NRHS + + MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS ) + if M is less than N, the code will execute correctly. + SMLSIZ is returned by ILAENV and is equal to the maximum + size of the subproblems at the bottom of the computation + tree (usually about 25), and + NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 ) + On exit, if INFO = 0, RWORK(1) returns the minimum LRWORK. + + IWORK (workspace) INTEGER array, dimension (MAX(1,LIWORK)) + LIWORK >= max(1, 3*MINMN*NLVL + 11*MINMN), + where MINMN = MIN( M,N ). + On exit, if INFO = 0, IWORK(1) returns the minimum LIWORK. + + INFO (output) INTEGER + = 0: successful exit + < 0: if INFO = -i, the i-th argument had an illegal value. + > 0: the algorithm for computing the SVD failed to converge; + if INFO = i, i off-diagonal elements of an intermediate + bidiagonal form did not converge to zero. + + Further Details + =============== + + Based on contributions by + Ming Gu and Ren-Cang Li, Computer Science Division, University of + California at Berkeley, USA + Osni Marques, LBNL/NERSC, USA ===================================================================== - Test the input arguments + Test the input arguments. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; - --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + --s; --work; + --rwork; + --iwork; /* Function Body */ *info = 0; + minmn = min(*m,*n); + maxmn = max(*m,*n); + lquery = *lwork == -1; if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; + } else if (*nrhs < 0) { + *info = -3; } else if (*lda < max(1,*m)) { - *info = -4; - } - if (*info != 0) { - i__1 = -(*info); - xerbla_("CGEQR2", &i__1); - return 0; + *info = -5; + } else if (*ldb < max(1,maxmn)) { + *info = -7; } - k = min(*m,*n); +/* + Compute workspace. + (Note: Comments in the code beginning "Workspace:" describe the + minimal amount of workspace needed at that point in the code, + as well as the preferred amount for good performance. + NB refers to the optimal block size for the immediately + following subroutine, as returned by ILAENV.) +*/ - i__1 = k; - for (i__ = 1; i__ <= i__1; ++i__) { + if (*info == 0) { + minwrk = 1; + maxwrk = 1; + liwork = 1; + lrwork = 1; + if (minmn > 0) { + smlsiz = ilaenv_(&c__9, "CGELSD", " ", &c__0, &c__0, &c__0, &c__0, + (ftnlen)6, (ftnlen)1); + mnthr = ilaenv_(&c__6, "CGELSD", " ", m, n, nrhs, &c_n1, (ftnlen) + 6, (ftnlen)1); +/* Computing MAX */ + i__1 = (integer) (log((real) minmn / (real) (smlsiz + 1)) / log( + 2.f)) + 1; + nlvl = max(i__1,0); + liwork = minmn * 3 * nlvl + minmn * 11; + mm = *m; + if (*m >= *n && *m >= mnthr) { -/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */ +/* + Path 1a - overdetermined, with many more rows than + columns. +*/ - i__2 = *m - i__ + 1; -/* Computing MIN */ - i__3 = i__ + 1; - clarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1] - , &c__1, &tau[i__]); - if (i__ < *n) { + mm = *n; +/* Computing MAX */ + i__1 = maxwrk, i__2 = *n * ilaenv_(&c__1, "CGEQRF", " ", m, n, + &c_n1, &c_n1, (ftnlen)6, (ftnlen)1); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *nrhs * ilaenv_(&c__1, "CUNMQR", "LC", + m, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)2); + maxwrk = max(i__1,i__2); + } + if (*m >= *n) { -/* Apply H(i)' to A(i:m,i+1:n) from the left */ +/* + Path 1 - overdetermined or exactly determined. - i__2 = i__ + i__ * a_dim1; - alpha.r = a[i__2].r, alpha.i = a[i__2].i; - i__2 = i__ + i__ * a_dim1; - a[i__2].r = 1.f, a[i__2].i = 0.f; - i__2 = *m - i__ + 1; - i__3 = *n - i__; - r_cnjg(&q__1, &tau[i__]); - clarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &q__1, - &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]); - i__2 = i__ + i__ * a_dim1; - a[i__2].r = alpha.r, a[i__2].i = alpha.i; + Computing MAX + Computing 2nd power +*/ + i__3 = smlsiz + 1; + i__1 = i__3 * i__3, i__2 = *n * (*nrhs + 1) + (*nrhs << 1); + lrwork = *n * 10 + (*n << 1) * smlsiz + (*n << 3) * nlvl + + smlsiz * 3 * *nrhs + max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = (*n << 1) + (mm + *n) * ilaenv_(&c__1, + "CGEBRD", " ", &mm, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = (*n << 1) + *nrhs * ilaenv_(&c__1, + "CUNMBR", "QLC", &mm, nrhs, n, &c_n1, (ftnlen)6, ( + ftnlen)3); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = (*n << 1) + (*n - 1) * ilaenv_(&c__1, + "CUNMBR", "PLN", n, nrhs, n, &c_n1, (ftnlen)6, ( + ftnlen)3); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = (*n << 1) + *n * *nrhs; + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = (*n << 1) + mm, i__2 = (*n << 1) + *n * *nrhs; + minwrk = max(i__1,i__2); + } + if (*n > *m) { +/* + Computing MAX + Computing 2nd power +*/ + i__3 = smlsiz + 1; + i__1 = i__3 * i__3, i__2 = *n * (*nrhs + 1) + (*nrhs << 1); + lrwork = *m * 10 + (*m << 1) * smlsiz + (*m << 3) * nlvl + + smlsiz * 3 * *nrhs + max(i__1,i__2); + if (*n >= mnthr) { + +/* + Path 2a - underdetermined, with many more columns + than rows. +*/ + + maxwrk = *m + *m * ilaenv_(&c__1, "CGELQF", " ", m, n, & + c_n1, &c_n1, (ftnlen)6, (ftnlen)1); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m << 1) * + ilaenv_(&c__1, "CGEBRD", " ", m, m, &c_n1, &c_n1, + (ftnlen)6, (ftnlen)1); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *nrhs * + ilaenv_(&c__1, "CUNMBR", "QLC", m, nrhs, m, &c_n1, + (ftnlen)6, (ftnlen)3); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m - 1) * + ilaenv_(&c__1, "CUNMLQ", "LC", n, nrhs, m, &c_n1, + (ftnlen)6, (ftnlen)2); + maxwrk = max(i__1,i__2); + if (*nrhs > 1) { +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + *m + *m * *nrhs; + maxwrk = max(i__1,i__2); + } else { +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 1); + maxwrk = max(i__1,i__2); + } +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *m * *nrhs; + maxwrk = max(i__1,i__2); +/* + XXX: Ensure the Path 2a case below is triggered. The workspace + calculation should use queries for all routines eventually. + Computing MAX + Computing MAX +*/ + i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), + i__3 = max(i__3,*nrhs), i__4 = *n - *m * 3; + i__1 = maxwrk, i__2 = (*m << 2) + *m * *m + max(i__3,i__4) + ; + maxwrk = max(i__1,i__2); + } else { + +/* Path 2 - underdetermined. */ + + maxwrk = (*m << 1) + (*n + *m) * ilaenv_(&c__1, "CGEBRD", + " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1); +/* Computing MAX */ + i__1 = maxwrk, i__2 = (*m << 1) + *nrhs * ilaenv_(&c__1, + "CUNMBR", "QLC", m, nrhs, m, &c_n1, (ftnlen)6, ( + ftnlen)3); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = (*m << 1) + *m * ilaenv_(&c__1, + "CUNMBR", "PLN", n, nrhs, m, &c_n1, (ftnlen)6, ( + ftnlen)3); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = (*m << 1) + *m * *nrhs; + maxwrk = max(i__1,i__2); + } +/* Computing MAX */ + i__1 = (*m << 1) + *n, i__2 = (*m << 1) + *m * *nrhs; + minwrk = max(i__1,i__2); + } + } + minwrk = min(minwrk,maxwrk); + work[1].r = (real) maxwrk, work[1].i = 0.f; + iwork[1] = liwork; + rwork[1] = (real) lrwork; + + if (*lwork < minwrk && ! lquery) { + *info = -12; } -/* L10: */ } - return 0; -/* End of CGEQR2 */ + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGELSD", &i__1); + return 0; + } else if (lquery) { + return 0; + } -} /* cgeqr2_ */ +/* Quick return if possible. */ -/* Subroutine */ int cgeqrf_(integer *m, integer *n, complex *a, integer *lda, - complex *tau, complex *work, integer *lwork, integer *info) -{ - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + if (*m == 0 || *n == 0) { + *rank = 0; + return 0; + } - /* Local variables */ - static integer i__, k, ib, nb, nx, iws, nbmin, iinfo; - extern /* Subroutine */ int cgeqr2_(integer *, integer *, complex *, - integer *, complex *, complex *, integer *), clarfb_(char *, char - *, char *, char *, integer *, integer *, integer *, complex *, - integer *, complex *, integer *, complex *, integer *, complex *, - integer *), clarft_(char *, char * - , integer *, integer *, complex *, integer *, complex *, complex * - , integer *), xerbla_(char *, integer *); - extern integer ilaenv_(integer *, char *, char *, integer *, integer *, - integer *, integer *, ftnlen, ftnlen); - static integer ldwork, lwkopt; - static logical lquery; +/* Get machine parameters. */ + eps = slamch_("P"); + sfmin = slamch_("S"); + smlnum = sfmin / eps; + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); -/* - -- LAPACK routine (version 3.2) -- - -- LAPACK is a software package provided by Univ. of Tennessee, -- - -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - November 2006 +/* Scale A if max entry outside range [SMLNUM,BIGNUM]. */ + anrm = clange_("M", m, n, &a[a_offset], lda, &rwork[1]); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { - Purpose - ======= +/* Scale matrix norm up to SMLNUM */ - CGEQRF computes a QR factorization of a complex M-by-N matrix A: + clascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { + +/* Scale matrix norm down to BIGNUM. */ + + clascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = max(*m,*n); + claset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[b_offset], ldb); + slaset_("F", &minmn, &c__1, &c_b328, &c_b328, &s[1], &c__1) + ; + *rank = 0; + goto L10; + } + +/* Scale B if max entry outside range [SMLNUM,BIGNUM]. */ + + bnrm = clange_("M", m, nrhs, &b[b_offset], ldb, &rwork[1]); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM. */ + + clascl_("G", &c__0, &c__0, &bnrm, &smlnum, m, nrhs, &b[b_offset], ldb, + info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM. */ + + clascl_("G", &c__0, &c__0, &bnrm, &bignum, m, nrhs, &b[b_offset], ldb, + info); + ibscl = 2; + } + +/* If M < N make sure B(M+1:N,:) = 0 */ + + if (*m < *n) { + i__1 = *n - *m; + claset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[*m + 1 + b_dim1], ldb); + } + +/* Overdetermined case. */ + + if (*m >= *n) { + +/* Path 1 - overdetermined or exactly determined. */ + + mm = *m; + if (*m >= mnthr) { + +/* Path 1a - overdetermined, with many more rows than columns */ + + mm = *n; + itau = 1; + nwork = itau + *n; + +/* + Compute A=Q*R. + (RWorkspace: need N) + (CWorkspace: need N, prefer N*NB) +*/ + + i__1 = *lwork - nwork + 1; + cgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1, + info); + +/* + Multiply B by transpose(Q). + (RWorkspace: need N) + (CWorkspace: need NRHS, prefer NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + cunmqr_("L", "C", m, nrhs, n, &a[a_offset], lda, &work[itau], &b[ + b_offset], ldb, &work[nwork], &i__1, info); + +/* Zero out below R. */ + + if (*n > 1) { + i__1 = *n - 1; + i__2 = *n - 1; + claset_("L", &i__1, &i__2, &c_b56, &c_b56, &a[a_dim1 + 2], + lda); + } + } + + itauq = 1; + itaup = itauq + *n; + nwork = itaup + *n; + ie = 1; + nrwork = ie + *n; + +/* + Bidiagonalize R in A. + (RWorkspace: need N) + (CWorkspace: need 2*N+MM, prefer 2*N+(MM+N)*NB) +*/ + + i__1 = *lwork - nwork + 1; + cgebrd_(&mm, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq], & + work[itaup], &work[nwork], &i__1, info); + +/* + Multiply B by transpose of left bidiagonalizing vectors of R. + (CWorkspace: need 2*N+NRHS, prefer 2*N+NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + cunmbr_("Q", "L", "C", &mm, nrhs, n, &a[a_offset], lda, &work[itauq], + &b[b_offset], ldb, &work[nwork], &i__1, info); + +/* Solve the bidiagonal least squares problem. */ + + clalsd_("U", &smlsiz, n, nrhs, &s[1], &rwork[ie], &b[b_offset], ldb, + rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1], info); + if (*info != 0) { + goto L10; + } + +/* Multiply B by right bidiagonalizing vectors of R. */ + + i__1 = *lwork - nwork + 1; + cunmbr_("P", "L", "N", n, nrhs, n, &a[a_offset], lda, &work[itaup], & + b[b_offset], ldb, &work[nwork], &i__1, info); + + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = *m, i__2 = (*m << 1) - 4, i__1 = max(i__1,i__2), i__1 = max( + i__1,*nrhs), i__2 = *n - *m * 3; + if (*n >= mnthr && *lwork >= (*m << 2) + *m * *m + max(i__1,i__2)) { + +/* + Path 2a - underdetermined, with many more columns than rows + and sufficient workspace for an efficient algorithm. +*/ + + ldwork = *m; +/* + Computing MAX + Computing MAX +*/ + i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), i__3 = + max(i__3,*nrhs), i__4 = *n - *m * 3; + i__1 = (*m << 2) + *m * *lda + max(i__3,i__4), i__2 = *m * *lda + + *m + *m * *nrhs; + if (*lwork >= max(i__1,i__2)) { + ldwork = *lda; + } + itau = 1; + nwork = *m + 1; + +/* + Compute A=L*Q. + (CWorkspace: need 2*M, prefer M+M*NB) +*/ + + i__1 = *lwork - nwork + 1; + cgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1, + info); + il = nwork; + +/* Copy L to WORK(IL), zeroing out above its diagonal. */ + + clacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwork); + i__1 = *m - 1; + i__2 = *m - 1; + claset_("U", &i__1, &i__2, &c_b56, &c_b56, &work[il + ldwork], & + ldwork); + itauq = il + ldwork * *m; + itaup = itauq + *m; + nwork = itaup + *m; + ie = 1; + nrwork = ie + *m; + +/* + Bidiagonalize L in WORK(IL). + (RWorkspace: need M) + (CWorkspace: need M*M+4*M, prefer M*M+4*M+2*M*NB) +*/ + + i__1 = *lwork - nwork + 1; + cgebrd_(m, m, &work[il], &ldwork, &s[1], &rwork[ie], &work[itauq], + &work[itaup], &work[nwork], &i__1, info); + +/* + Multiply B by transpose of left bidiagonalizing vectors of L. + (CWorkspace: need M*M+4*M+NRHS, prefer M*M+4*M+NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + cunmbr_("Q", "L", "C", m, nrhs, m, &work[il], &ldwork, &work[ + itauq], &b[b_offset], ldb, &work[nwork], &i__1, info); + +/* Solve the bidiagonal least squares problem. */ + + clalsd_("U", &smlsiz, m, nrhs, &s[1], &rwork[ie], &b[b_offset], + ldb, rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1], + info); + if (*info != 0) { + goto L10; + } + +/* Multiply B by right bidiagonalizing vectors of L. */ + + i__1 = *lwork - nwork + 1; + cunmbr_("P", "L", "N", m, nrhs, m, &work[il], &ldwork, &work[ + itaup], &b[b_offset], ldb, &work[nwork], &i__1, info); + +/* Zero out below first M rows of B. */ + + i__1 = *n - *m; + claset_("F", &i__1, nrhs, &c_b56, &c_b56, &b[*m + 1 + b_dim1], + ldb); + nwork = itau + *m; + +/* + Multiply transpose(Q) by B. + (CWorkspace: need NRHS, prefer NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + cunmlq_("L", "C", n, nrhs, m, &a[a_offset], lda, &work[itau], &b[ + b_offset], ldb, &work[nwork], &i__1, info); + + } else { + +/* Path 2 - remaining underdetermined cases. */ + + itauq = 1; + itaup = itauq + *m; + nwork = itaup + *m; + ie = 1; + nrwork = ie + *m; + +/* + Bidiagonalize A. + (RWorkspace: need M) + (CWorkspace: need 2*M+N, prefer 2*M+(M+N)*NB) +*/ + + i__1 = *lwork - nwork + 1; + cgebrd_(m, n, &a[a_offset], lda, &s[1], &rwork[ie], &work[itauq], + &work[itaup], &work[nwork], &i__1, info); + +/* + Multiply B by transpose of left bidiagonalizing vectors. + (CWorkspace: need 2*M+NRHS, prefer 2*M+NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + cunmbr_("Q", "L", "C", m, nrhs, n, &a[a_offset], lda, &work[itauq] + , &b[b_offset], ldb, &work[nwork], &i__1, info); + +/* Solve the bidiagonal least squares problem. */ + + clalsd_("L", &smlsiz, m, nrhs, &s[1], &rwork[ie], &b[b_offset], + ldb, rcond, rank, &work[nwork], &rwork[nrwork], &iwork[1], + info); + if (*info != 0) { + goto L10; + } + +/* Multiply B by right bidiagonalizing vectors of A. */ + + i__1 = *lwork - nwork + 1; + cunmbr_("P", "L", "N", n, nrhs, m, &a[a_offset], lda, &work[itaup] + , &b[b_offset], ldb, &work[nwork], &i__1, info); + + } + } + +/* Undo scaling. */ + + if (iascl == 1) { + clascl_("G", &c__0, &c__0, &anrm, &smlnum, n, nrhs, &b[b_offset], ldb, + info); + slascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], & + minmn, info); + } else if (iascl == 2) { + clascl_("G", &c__0, &c__0, &anrm, &bignum, n, nrhs, &b[b_offset], ldb, + info); + slascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], & + minmn, info); + } + if (ibscl == 1) { + clascl_("G", &c__0, &c__0, &smlnum, &bnrm, n, nrhs, &b[b_offset], ldb, + info); + } else if (ibscl == 2) { + clascl_("G", &c__0, &c__0, &bignum, &bnrm, n, nrhs, &b[b_offset], ldb, + info); + } + +L10: + work[1].r = (real) maxwrk, work[1].i = 0.f; + iwork[1] = liwork; + rwork[1] = (real) lrwork; + return 0; + +/* End of CGELSD */ + +} /* cgelsd_ */ + +/* Subroutine */ int cgeqr2_(integer *m, integer *n, complex *a, integer *lda, + complex *tau, complex *work, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3; + complex q__1; + + /* Local variables */ + static integer i__, k; + static complex alpha; + extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex * + , integer *, complex *, complex *, integer *, complex *), + clarfg_(integer *, complex *, complex *, integer *, complex *), + xerbla_(char *, integer *); + + +/* + -- LAPACK routine (version 3.2.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + June 2010 + + + Purpose + ======= + + CGEQR2 computes a QR factorization of a complex m by n matrix A: A = Q * R. Arguments @@ -2866,13 +3455,12 @@ static real c_b2023 = .5f; The number of columns of the matrix A. N >= 0. A (input/output) COMPLEX array, dimension (LDA,N) - On entry, the M-by-N matrix A. + On entry, the m by n matrix A. On exit, the elements on and above the diagonal of the array - contain the min(M,N)-by-N upper trapezoidal matrix R (R is + contain the min(m,n) by n upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the unitary matrix Q as a - product of min(m,n) elementary reflectors (see Further - Details). + product of elementary reflectors (see Further Details). LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). @@ -2881,22 +3469,11 @@ static real c_b2023 = .5f; The scalar factors of the elementary reflectors (see Further Details). - WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) - On exit, if INFO = 0, WORK(1) returns the optimal LWORK. - - LWORK (input) INTEGER - The dimension of the array WORK. LWORK >= max(1,N). - For optimum performance LWORK >= N*NB, where NB is - the optimal blocksize. - - If LWORK = -1, then a workspace query is assumed; the routine - only calculates the optimal size of the WORK array, returns - this value as the first entry of the WORK array, and no error - message related to LWORK is issued by XERBLA. + WORK (workspace) COMPLEX array, dimension (N) INFO (output) INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value + = 0: successful exit + < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== @@ -2928,63 +3505,218 @@ static real c_b2023 = .5f; /* Function Body */ *info = 0; - nb = ilaenv_(&c__1, "CGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen) - 1); - lwkopt = *n * nb; - work[1].r = (real) lwkopt, work[1].i = 0.f; - lquery = *lwork == -1; if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*lda < max(1,*m)) { *info = -4; - } else if (*lwork < max(1,*n) && ! lquery) { - *info = -7; } if (*info != 0) { i__1 = -(*info); - xerbla_("CGEQRF", &i__1); - return 0; - } else if (lquery) { + xerbla_("CGEQR2", &i__1); return 0; } -/* Quick return if possible */ - k = min(*m,*n); - if (k == 0) { - work[1].r = 1.f, work[1].i = 0.f; - return 0; - } - - nbmin = 2; - nx = 0; - iws = *n; - if (nb > 1 && nb < k) { -/* - Determine when to cross over from blocked to unblocked code. - - Computing MAX -*/ - i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQRF", " ", m, n, &c_n1, &c_n1, ( - ftnlen)6, (ftnlen)1); - nx = max(i__1,i__2); - if (nx < k) { + i__1 = k; + for (i__ = 1; i__ <= i__1; ++i__) { -/* Determine if workspace is large enough for blocked code. */ +/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */ - ldwork = *n; - iws = ldwork * nb; - if (*lwork < iws) { + i__2 = *m - i__ + 1; +/* Computing MIN */ + i__3 = i__ + 1; + clarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1] + , &c__1, &tau[i__]); + if (i__ < *n) { -/* - Not enough workspace to use optimal NB: reduce NB and - determine the minimum value of NB. -*/ +/* Apply H(i)' to A(i:m,i+1:n) from the left */ - nb = *lwork / ldwork; + i__2 = i__ + i__ * a_dim1; + alpha.r = a[i__2].r, alpha.i = a[i__2].i; + i__2 = i__ + i__ * a_dim1; + a[i__2].r = 1.f, a[i__2].i = 0.f; + i__2 = *m - i__ + 1; + i__3 = *n - i__; + r_cnjg(&q__1, &tau[i__]); + clarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &q__1, + &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]); + i__2 = i__ + i__ * a_dim1; + a[i__2].r = alpha.r, a[i__2].i = alpha.i; + } +/* L10: */ + } + return 0; + +/* End of CGEQR2 */ + +} /* cgeqr2_ */ + +/* Subroutine */ int cgeqrf_(integer *m, integer *n, complex *a, integer *lda, + complex *tau, complex *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + static integer i__, k, ib, nb, nx, iws, nbmin, iinfo; + extern /* Subroutine */ int cgeqr2_(integer *, integer *, complex *, + integer *, complex *, complex *, integer *), clarfb_(char *, char + *, char *, char *, integer *, integer *, integer *, complex *, + integer *, complex *, integer *, complex *, integer *, complex *, + integer *), clarft_(char *, char * + , integer *, integer *, complex *, integer *, complex *, complex * + , integer *), xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + static integer ldwork, lwkopt; + static logical lquery; + + +/* + -- LAPACK routine (version 3.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + November 2006 + + + Purpose + ======= + + CGEQRF computes a QR factorization of a complex M-by-N matrix A: + A = Q * R. + + Arguments + ========= + + M (input) INTEGER + The number of rows of the matrix A. M >= 0. + + N (input) INTEGER + The number of columns of the matrix A. N >= 0. + + A (input/output) COMPLEX array, dimension (LDA,N) + On entry, the M-by-N matrix A. + On exit, the elements on and above the diagonal of the array + contain the min(M,N)-by-N upper trapezoidal matrix R (R is + upper triangular if m >= n); the elements below the diagonal, + with the array TAU, represent the unitary matrix Q as a + product of min(m,n) elementary reflectors (see Further + Details). + + LDA (input) INTEGER + The leading dimension of the array A. LDA >= max(1,M). + + TAU (output) COMPLEX array, dimension (min(M,N)) + The scalar factors of the elementary reflectors (see Further + Details). + + WORK (workspace/output) COMPLEX array, dimension (MAX(1,LWORK)) + On exit, if INFO = 0, WORK(1) returns the optimal LWORK. + + LWORK (input) INTEGER + The dimension of the array WORK. LWORK >= max(1,N). + For optimum performance LWORK >= N*NB, where NB is + the optimal blocksize. + + If LWORK = -1, then a workspace query is assumed; the routine + only calculates the optimal size of the WORK array, returns + this value as the first entry of the WORK array, and no error + message related to LWORK is issued by XERBLA. + + INFO (output) INTEGER + = 0: successful exit + < 0: if INFO = -i, the i-th argument had an illegal value + + Further Details + =============== + + The matrix Q is represented as a product of elementary reflectors + + Q = H(1) H(2) . . . H(k), where k = min(m,n). + + Each H(i) has the form + + H(i) = I - tau * v * v' + + where tau is a complex scalar, and v is a complex vector with + v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), + and tau in TAU(i). + + ===================================================================== + + + Test the input arguments +*/ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --tau; + --work; + + /* Function Body */ + *info = 0; + nb = ilaenv_(&c__1, "CGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen) + 1); + lwkopt = *n * nb; + work[1].r = (real) lwkopt, work[1].i = 0.f; + lquery = *lwork == -1; + if (*m < 0) { + *info = -1; + } else if (*n < 0) { + *info = -2; + } else if (*lda < max(1,*m)) { + *info = -4; + } else if (*lwork < max(1,*n) && ! lquery) { + *info = -7; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CGEQRF", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + k = min(*m,*n); + if (k == 0) { + work[1].r = 1.f, work[1].i = 0.f; + return 0; + } + + nbmin = 2; + nx = 0; + iws = *n; + if (nb > 1 && nb < k) { + +/* + Determine when to cross over from blocked to unblocked code. + + Computing MAX +*/ + i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQRF", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nx = max(i__1,i__2); + if (nx < k) { + +/* Determine if workspace is large enough for blocked code. */ + + ldwork = *n; + iws = ldwork * nb; + if (*lwork < iws) { + +/* + Not enough workspace to use optimal NB: reduce NB and + determine the minimum value of NB. +*/ + + nb = *lwork / ldwork; /* Computing MAX */ i__1 = 2, i__2 = ilaenv_(&c__2, "CGEQRF", " ", m, n, &c_n1, & c_n1, (ftnlen)6, (ftnlen)1); @@ -6471,8 +7203,8 @@ static real c_b2023 = .5f; sigma = rmax / anrm; } if (iscale == 1) { - clascl_(uplo, &c__0, &c__0, &c_b894, &sigma, n, n, &a[a_offset], lda, - info); + clascl_(uplo, &c__0, &c__0, &c_b1034, &sigma, n, n, &a[a_offset], lda, + info); } /* Call CHETRD to reduce Hermitian matrix to tridiagonal form. */ @@ -7135,7 +7867,7 @@ static real c_b2023 = .5f; i__3 = i__ - 1; q__1.r = -1.f, q__1.i = -0.f; cher2k_(uplo, "No transpose", &i__3, &nb, &q__1, &a[i__ * a_dim1 - + 1], lda, &work[1], &ldwork, &c_b894, &a[a_offset], lda); + + 1], lda, &work[1], &ldwork, &c_b1034, &a[a_offset], lda); /* Copy superdiagonal elements back into A, and diagonal @@ -7184,7 +7916,7 @@ static real c_b2023 = .5f; i__3 = *n - i__ - nb + 1; q__1.r = -1.f, q__1.i = -0.f; cher2k_(uplo, "No transpose", &i__3, &nb, &q__1, &a[i__ + nb + - i__ * a_dim1], lda, &work[nb + 1], &ldwork, &c_b894, &a[ + i__ * a_dim1], lda, &work[nb + 1], &ldwork, &c_b1034, &a[ i__ + nb + (i__ + nb) * a_dim1], lda); /* @@ -8537,8 +9269,8 @@ static real c_b2023 = .5f; } l = *m * *n + 1; - sgemm_("N", "N", m, n, n, &c_b894, &rwork[1], m, &b[b_offset], ldb, & - c_b1087, &rwork[l], m); + sgemm_("N", "N", m, n, n, &c_b1034, &rwork[1], m, &b[b_offset], ldb, & + c_b328, &rwork[l], m); i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = *m; @@ -8560,8 +9292,8 @@ static real c_b2023 = .5f; } /* L60: */ } - sgemm_("N", "N", m, n, n, &c_b894, &rwork[1], m, &b[b_offset], ldb, & - c_b1087, &rwork[l], m); + sgemm_("N", "N", m, n, n, &c_b1034, &rwork[1], m, &b[b_offset], ldb, & + c_b328, &rwork[l], m); i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = *m; @@ -9495,7 +10227,7 @@ static real c_b2023 = .5f; n1p1 = n1 + 1; if (*rho < 0.f) { - sscal_(&n2, &c_b1136, &z__[n1p1], &c__1); + sscal_(&n2, &c_b1276, &z__[n1p1], &c__1); } /* Normalize z so that norm(z) = 1 */ @@ -10746,6 +11478,1932 @@ static real c_b2023 = .5f; } /* clahr2_ */ +/* Subroutine */ int clals0_(integer *icompq, integer *nl, integer *nr, + integer *sqre, integer *nrhs, complex *b, integer *ldb, complex *bx, + integer *ldbx, integer *perm, integer *givptr, integer *givcol, + integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real * + difl, real *difr, real *z__, integer *k, real *c__, real *s, real * + rwork, integer *info) +{ + /* System generated locals */ + integer givcol_dim1, givcol_offset, difr_dim1, difr_offset, givnum_dim1, + givnum_offset, poles_dim1, poles_offset, b_dim1, b_offset, + bx_dim1, bx_offset, i__1, i__2, i__3, i__4, i__5; + real r__1; + complex q__1; + + /* Local variables */ + static integer i__, j, m, n; + static real dj; + static integer nlp1, jcol; + static real temp; + static integer jrow; + extern doublereal snrm2_(integer *, real *, integer *); + static real diflj, difrj, dsigj; + extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + complex *, integer *), sgemv_(char *, integer *, integer *, real * + , real *, integer *, real *, integer *, real *, real *, integer *), csrot_(integer *, complex *, integer *, complex *, + integer *, real *, real *); + extern doublereal slamc3_(real *, real *); + extern /* Subroutine */ int clascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, complex *, integer *, integer *), csscal_(integer *, real *, complex *, integer *), + clacpy_(char *, integer *, integer *, complex *, integer *, + complex *, integer *), xerbla_(char *, integer *); + static real dsigjp; + + +/* + -- LAPACK routine (version 3.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + November 2006 + + + Purpose + ======= + + CLALS0 applies back the multiplying factors of either the left or the + right singular vector matrix of a diagonal matrix appended by a row + to the right hand side matrix B in solving the least squares problem + using the divide-and-conquer SVD approach. + + For the left singular vector matrix, three types of orthogonal + matrices are involved: + + (1L) Givens rotations: the number of such rotations is GIVPTR; the + pairs of columns/rows they were applied to are stored in GIVCOL; + and the C- and S-values of these rotations are stored in GIVNUM. + + (2L) Permutation. The (NL+1)-st row of B is to be moved to the first + row, and for J=2:N, PERM(J)-th row of B is to be moved to the + J-th row. + + (3L) The left singular vector matrix of the remaining matrix. + + For the right singular vector matrix, four types of orthogonal + matrices are involved: + + (1R) The right singular vector matrix of the remaining matrix. + + (2R) If SQRE = 1, one extra Givens rotation to generate the right + null space. + + (3R) The inverse transformation of (2L). + + (4R) The inverse transformation of (1L). + + Arguments + ========= + + ICOMPQ (input) INTEGER + Specifies whether singular vectors are to be computed in + factored form: + = 0: Left singular vector matrix. + = 1: Right singular vector matrix. + + NL (input) INTEGER + The row dimension of the upper block. NL >= 1. + + NR (input) INTEGER + The row dimension of the lower block. NR >= 1. + + SQRE (input) INTEGER + = 0: the lower block is an NR-by-NR square matrix. + = 1: the lower block is an NR-by-(NR+1) rectangular matrix. + + The bidiagonal matrix has row dimension N = NL + NR + 1, + and column dimension M = N + SQRE. + + NRHS (input) INTEGER + The number of columns of B and BX. NRHS must be at least 1. + + B (input/output) COMPLEX array, dimension ( LDB, NRHS ) + On input, B contains the right hand sides of the least + squares problem in rows 1 through M. On output, B contains + the solution X in rows 1 through N. + + LDB (input) INTEGER + The leading dimension of B. LDB must be at least + max(1,MAX( M, N ) ). + + BX (workspace) COMPLEX array, dimension ( LDBX, NRHS ) + + LDBX (input) INTEGER + The leading dimension of BX. + + PERM (input) INTEGER array, dimension ( N ) + The permutations (from deflation and sorting) applied + to the two blocks. + + GIVPTR (input) INTEGER + The number of Givens rotations which took place in this + subproblem. + + GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 ) + Each pair of numbers indicates a pair of rows/columns + involved in a Givens rotation. + + LDGCOL (input) INTEGER + The leading dimension of GIVCOL, must be at least N. + + GIVNUM (input) REAL array, dimension ( LDGNUM, 2 ) + Each number indicates the C or S value used in the + corresponding Givens rotation. + + LDGNUM (input) INTEGER + The leading dimension of arrays DIFR, POLES and + GIVNUM, must be at least K. + + POLES (input) REAL array, dimension ( LDGNUM, 2 ) + On entry, POLES(1:K, 1) contains the new singular + values obtained from solving the secular equation, and + POLES(1:K, 2) is an array containing the poles in the secular + equation. + + DIFL (input) REAL array, dimension ( K ). + On entry, DIFL(I) is the distance between I-th updated + (undeflated) singular value and the I-th (undeflated) old + singular value. + + DIFR (input) REAL array, dimension ( LDGNUM, 2 ). + On entry, DIFR(I, 1) contains the distances between I-th + updated (undeflated) singular value and the I+1-th + (undeflated) old singular value. And DIFR(I, 2) is the + normalizing factor for the I-th right singular vector. + + Z (input) REAL array, dimension ( K ) + Contain the components of the deflation-adjusted updating row + vector. + + K (input) INTEGER + Contains the dimension of the non-deflated matrix, + This is the order of the related secular equation. 1 <= K <=N. + + C (input) REAL + C contains garbage if SQRE =0 and the C-value of a Givens + rotation related to the right null space if SQRE = 1. + + S (input) REAL + S contains garbage if SQRE =0 and the S-value of a Givens + rotation related to the right null space if SQRE = 1. + + RWORK (workspace) REAL array, dimension + ( K*(1+NRHS) + 2*NRHS ) + + INFO (output) INTEGER + = 0: successful exit. + < 0: if INFO = -i, the i-th argument had an illegal value. + + Further Details + =============== + + Based on contributions by + Ming Gu and Ren-Cang Li, Computer Science Division, University of + California at Berkeley, USA + Osni Marques, LBNL/NERSC, USA + + ===================================================================== + + + Test the input parameters. +*/ + + /* Parameter adjustments */ + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + bx_dim1 = *ldbx; + bx_offset = 1 + bx_dim1; + bx -= bx_offset; + --perm; + givcol_dim1 = *ldgcol; + givcol_offset = 1 + givcol_dim1; + givcol -= givcol_offset; + difr_dim1 = *ldgnum; + difr_offset = 1 + difr_dim1; + difr -= difr_offset; + poles_dim1 = *ldgnum; + poles_offset = 1 + poles_dim1; + poles -= poles_offset; + givnum_dim1 = *ldgnum; + givnum_offset = 1 + givnum_dim1; + givnum -= givnum_offset; + --difl; + --z__; + --rwork; + + /* Function Body */ + *info = 0; + + if (*icompq < 0 || *icompq > 1) { + *info = -1; + } else if (*nl < 1) { + *info = -2; + } else if (*nr < 1) { + *info = -3; + } else if (*sqre < 0 || *sqre > 1) { + *info = -4; + } + + n = *nl + *nr + 1; + + if (*nrhs < 1) { + *info = -5; + } else if (*ldb < n) { + *info = -7; + } else if (*ldbx < n) { + *info = -9; + } else if (*givptr < 0) { + *info = -11; + } else if (*ldgcol < n) { + *info = -13; + } else if (*ldgnum < n) { + *info = -15; + } else if (*k < 1) { + *info = -20; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CLALS0", &i__1); + return 0; + } + + m = n + *sqre; + nlp1 = *nl + 1; + + if (*icompq == 0) { + +/* + Apply back orthogonal transformations from the left. + + Step (1L): apply back the Givens rotations performed. +*/ + + i__1 = *givptr; + for (i__ = 1; i__ <= i__1; ++i__) { + csrot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, & + b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ + + (givnum_dim1 << 1)], &givnum[i__ + givnum_dim1]); +/* L10: */ + } + +/* Step (2L): permute rows of B. */ + + ccopy_(nrhs, &b[nlp1 + b_dim1], ldb, &bx[bx_dim1 + 1], ldbx); + i__1 = n; + for (i__ = 2; i__ <= i__1; ++i__) { + ccopy_(nrhs, &b[perm[i__] + b_dim1], ldb, &bx[i__ + bx_dim1], + ldbx); +/* L20: */ + } + +/* + Step (3L): apply the inverse of the left singular vector + matrix to BX. +*/ + + if (*k == 1) { + ccopy_(nrhs, &bx[bx_offset], ldbx, &b[b_offset], ldb); + if (z__[1] < 0.f) { + csscal_(nrhs, &c_b1276, &b[b_offset], ldb); + } + } else { + i__1 = *k; + for (j = 1; j <= i__1; ++j) { + diflj = difl[j]; + dj = poles[j + poles_dim1]; + dsigj = -poles[j + (poles_dim1 << 1)]; + if (j < *k) { + difrj = -difr[j + difr_dim1]; + dsigjp = -poles[j + 1 + (poles_dim1 << 1)]; + } + if (z__[j] == 0.f || poles[j + (poles_dim1 << 1)] == 0.f) { + rwork[j] = 0.f; + } else { + rwork[j] = -poles[j + (poles_dim1 << 1)] * z__[j] / diflj + / (poles[j + (poles_dim1 << 1)] + dj); + } + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] == + 0.f) { + rwork[i__] = 0.f; + } else { + rwork[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__] + / (slamc3_(&poles[i__ + (poles_dim1 << 1)], & + dsigj) - diflj) / (poles[i__ + (poles_dim1 << + 1)] + dj); + } +/* L30: */ + } + i__2 = *k; + for (i__ = j + 1; i__ <= i__2; ++i__) { + if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] == + 0.f) { + rwork[i__] = 0.f; + } else { + rwork[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__] + / (slamc3_(&poles[i__ + (poles_dim1 << 1)], & + dsigjp) + difrj) / (poles[i__ + (poles_dim1 << + 1)] + dj); + } +/* L40: */ + } + rwork[1] = -1.f; + temp = snrm2_(k, &rwork[1], &c__1); + +/* + Since B and BX are complex, the following call to SGEMV + is performed in two steps (real and imaginary parts). + + CALL SGEMV( 'T', K, NRHS, ONE, BX, LDBX, WORK, 1, ZERO, + $ B( J, 1 ), LDB ) +*/ + + i__ = *k + (*nrhs << 1); + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = *k; + for (jrow = 1; jrow <= i__3; ++jrow) { + ++i__; + i__4 = jrow + jcol * bx_dim1; + rwork[i__] = bx[i__4].r; +/* L50: */ + } +/* L60: */ + } + sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)], + k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1], &c__1); + i__ = *k + (*nrhs << 1); + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = *k; + for (jrow = 1; jrow <= i__3; ++jrow) { + ++i__; + rwork[i__] = r_imag(&bx[jrow + jcol * bx_dim1]); +/* L70: */ + } +/* L80: */ + } + sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)], + k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1 + *nrhs], + &c__1); + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = j + jcol * b_dim1; + i__4 = jcol + *k; + i__5 = jcol + *k + *nrhs; + q__1.r = rwork[i__4], q__1.i = rwork[i__5]; + b[i__3].r = q__1.r, b[i__3].i = q__1.i; +/* L90: */ + } + clascl_("G", &c__0, &c__0, &temp, &c_b1034, &c__1, nrhs, &b[j + + b_dim1], ldb, info); +/* L100: */ + } + } + +/* Move the deflated rows of BX to B also. */ + + if (*k < max(m,n)) { + i__1 = n - *k; + clacpy_("A", &i__1, nrhs, &bx[*k + 1 + bx_dim1], ldbx, &b[*k + 1 + + b_dim1], ldb); + } + } else { + +/* + Apply back the right orthogonal transformations. + + Step (1R): apply back the new right singular vector matrix + to B. +*/ + + if (*k == 1) { + ccopy_(nrhs, &b[b_offset], ldb, &bx[bx_offset], ldbx); + } else { + i__1 = *k; + for (j = 1; j <= i__1; ++j) { + dsigj = poles[j + (poles_dim1 << 1)]; + if (z__[j] == 0.f) { + rwork[j] = 0.f; + } else { + rwork[j] = -z__[j] / difl[j] / (dsigj + poles[j + + poles_dim1]) / difr[j + (difr_dim1 << 1)]; + } + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + if (z__[j] == 0.f) { + rwork[i__] = 0.f; + } else { + r__1 = -poles[i__ + 1 + (poles_dim1 << 1)]; + rwork[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difr[ + i__ + difr_dim1]) / (dsigj + poles[i__ + + poles_dim1]) / difr[i__ + (difr_dim1 << 1)]; + } +/* L110: */ + } + i__2 = *k; + for (i__ = j + 1; i__ <= i__2; ++i__) { + if (z__[j] == 0.f) { + rwork[i__] = 0.f; + } else { + r__1 = -poles[i__ + (poles_dim1 << 1)]; + rwork[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difl[ + i__]) / (dsigj + poles[i__ + poles_dim1]) / + difr[i__ + (difr_dim1 << 1)]; + } +/* L120: */ + } + +/* + Since B and BX are complex, the following call to SGEMV + is performed in two steps (real and imaginary parts). + + CALL SGEMV( 'T', K, NRHS, ONE, B, LDB, WORK, 1, ZERO, + $ BX( J, 1 ), LDBX ) +*/ + + i__ = *k + (*nrhs << 1); + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = *k; + for (jrow = 1; jrow <= i__3; ++jrow) { + ++i__; + i__4 = jrow + jcol * b_dim1; + rwork[i__] = b[i__4].r; +/* L130: */ + } +/* L140: */ + } + sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)], + k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1], &c__1); + i__ = *k + (*nrhs << 1); + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = *k; + for (jrow = 1; jrow <= i__3; ++jrow) { + ++i__; + rwork[i__] = r_imag(&b[jrow + jcol * b_dim1]); +/* L150: */ + } +/* L160: */ + } + sgemv_("T", k, nrhs, &c_b1034, &rwork[*k + 1 + (*nrhs << 1)], + k, &rwork[1], &c__1, &c_b328, &rwork[*k + 1 + *nrhs], + &c__1); + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = j + jcol * bx_dim1; + i__4 = jcol + *k; + i__5 = jcol + *k + *nrhs; + q__1.r = rwork[i__4], q__1.i = rwork[i__5]; + bx[i__3].r = q__1.r, bx[i__3].i = q__1.i; +/* L170: */ + } +/* L180: */ + } + } + +/* + Step (2R): if SQRE = 1, apply back the rotation that is + related to the right null space of the subproblem. +*/ + + if (*sqre == 1) { + ccopy_(nrhs, &b[m + b_dim1], ldb, &bx[m + bx_dim1], ldbx); + csrot_(nrhs, &bx[bx_dim1 + 1], ldbx, &bx[m + bx_dim1], ldbx, c__, + s); + } + if (*k < max(m,n)) { + i__1 = n - *k; + clacpy_("A", &i__1, nrhs, &b[*k + 1 + b_dim1], ldb, &bx[*k + 1 + + bx_dim1], ldbx); + } + +/* Step (3R): permute rows of B. */ + + ccopy_(nrhs, &bx[bx_dim1 + 1], ldbx, &b[nlp1 + b_dim1], ldb); + if (*sqre == 1) { + ccopy_(nrhs, &bx[m + bx_dim1], ldbx, &b[m + b_dim1], ldb); + } + i__1 = n; + for (i__ = 2; i__ <= i__1; ++i__) { + ccopy_(nrhs, &bx[i__ + bx_dim1], ldbx, &b[perm[i__] + b_dim1], + ldb); +/* L190: */ + } + +/* Step (4R): apply back the Givens rotations performed. */ + + for (i__ = *givptr; i__ >= 1; --i__) { + r__1 = -givnum[i__ + givnum_dim1]; + csrot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, & + b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ + + (givnum_dim1 << 1)], &r__1); +/* L200: */ + } + } + + return 0; + +/* End of CLALS0 */ + +} /* clals0_ */ + +/* Subroutine */ int clalsa_(integer *icompq, integer *smlsiz, integer *n, + integer *nrhs, complex *b, integer *ldb, complex *bx, integer *ldbx, + real *u, integer *ldu, real *vt, integer *k, real *difl, real *difr, + real *z__, real *poles, integer *givptr, integer *givcol, integer * + ldgcol, integer *perm, real *givnum, real *c__, real *s, real *rwork, + integer *iwork, integer *info) +{ + /* System generated locals */ + integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, difl_dim1, + difl_offset, difr_dim1, difr_offset, givnum_dim1, givnum_offset, + poles_dim1, poles_offset, u_dim1, u_offset, vt_dim1, vt_offset, + z_dim1, z_offset, b_dim1, b_offset, bx_dim1, bx_offset, i__1, + i__2, i__3, i__4, i__5, i__6; + complex q__1; + + /* Local variables */ + static integer i__, j, i1, ic, lf, nd, ll, nl, nr, im1, nlf, nrf, lvl, + ndb1, nlp1, lvl2, nrp1, jcol, nlvl, sqre, jrow, jimag, jreal, + inode, ndiml; + extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *); + static integer ndimr; + extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + complex *, integer *), clals0_(integer *, integer *, integer *, + integer *, integer *, complex *, integer *, complex *, integer *, + integer *, integer *, integer *, integer *, real *, integer *, + real *, real *, real *, real *, integer *, real *, real *, real *, + integer *), xerbla_(char *, integer *), slasdt_(integer * + , integer *, integer *, integer *, integer *, integer *, integer * + ); + + +/* + -- LAPACK routine (version 3.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + November 2006 + + + Purpose + ======= + + CLALSA is an itermediate step in solving the least squares problem + by computing the SVD of the coefficient matrix in compact form (The + singular vectors are computed as products of simple orthorgonal + matrices.). + + If ICOMPQ = 0, CLALSA applies the inverse of the left singular vector + matrix of an upper bidiagonal matrix to the right hand side; and if + ICOMPQ = 1, CLALSA applies the right singular vector matrix to the + right hand side. The singular vector matrices were generated in + compact form by CLALSA. + + Arguments + ========= + + ICOMPQ (input) INTEGER + Specifies whether the left or the right singular vector + matrix is involved. + = 0: Left singular vector matrix + = 1: Right singular vector matrix + + SMLSIZ (input) INTEGER + The maximum size of the subproblems at the bottom of the + computation tree. + + N (input) INTEGER + The row and column dimensions of the upper bidiagonal matrix. + + NRHS (input) INTEGER + The number of columns of B and BX. NRHS must be at least 1. + + B (input/output) COMPLEX array, dimension ( LDB, NRHS ) + On input, B contains the right hand sides of the least + squares problem in rows 1 through M. + On output, B contains the solution X in rows 1 through N. + + LDB (input) INTEGER + The leading dimension of B in the calling subprogram. + LDB must be at least max(1,MAX( M, N ) ). + + BX (output) COMPLEX array, dimension ( LDBX, NRHS ) + On exit, the result of applying the left or right singular + vector matrix to B. + + LDBX (input) INTEGER + The leading dimension of BX. + + U (input) REAL array, dimension ( LDU, SMLSIZ ). + On entry, U contains the left singular vector matrices of all + subproblems at the bottom level. + + LDU (input) INTEGER, LDU = > N. + The leading dimension of arrays U, VT, DIFL, DIFR, + POLES, GIVNUM, and Z. + + VT (input) REAL array, dimension ( LDU, SMLSIZ+1 ). + On entry, VT' contains the right singular vector matrices of + all subproblems at the bottom level. + + K (input) INTEGER array, dimension ( N ). + + DIFL (input) REAL array, dimension ( LDU, NLVL ). + where NLVL = INT(log_2 (N/(SMLSIZ+1))) + 1. + + DIFR (input) REAL array, dimension ( LDU, 2 * NLVL ). + On entry, DIFL(*, I) and DIFR(*, 2 * I -1) record + distances between singular values on the I-th level and + singular values on the (I -1)-th level, and DIFR(*, 2 * I) + record the normalizing factors of the right singular vectors + matrices of subproblems on I-th level. + + Z (input) REAL array, dimension ( LDU, NLVL ). + On entry, Z(1, I) contains the components of the deflation- + adjusted updating row vector for subproblems on the I-th + level. + + POLES (input) REAL array, dimension ( LDU, 2 * NLVL ). + On entry, POLES(*, 2 * I -1: 2 * I) contains the new and old + singular values involved in the secular equations on the I-th + level. + + GIVPTR (input) INTEGER array, dimension ( N ). + On entry, GIVPTR( I ) records the number of Givens + rotations performed on the I-th problem on the computation + tree. + + GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 * NLVL ). + On entry, for each I, GIVCOL(*, 2 * I - 1: 2 * I) records the + locations of Givens rotations performed on the I-th level on + the computation tree. + + LDGCOL (input) INTEGER, LDGCOL = > N. + The leading dimension of arrays GIVCOL and PERM. + + PERM (input) INTEGER array, dimension ( LDGCOL, NLVL ). + On entry, PERM(*, I) records permutations done on the I-th + level of the computation tree. + + GIVNUM (input) REAL array, dimension ( LDU, 2 * NLVL ). + On entry, GIVNUM(*, 2 *I -1 : 2 * I) records the C- and S- + values of Givens rotations performed on the I-th level on the + computation tree. + + C (input) REAL array, dimension ( N ). + On entry, if the I-th subproblem is not square, + C( I ) contains the C-value of a Givens rotation related to + the right null space of the I-th subproblem. + + S (input) REAL array, dimension ( N ). + On entry, if the I-th subproblem is not square, + S( I ) contains the S-value of a Givens rotation related to + the right null space of the I-th subproblem. + + RWORK (workspace) REAL array, dimension at least + MAX( (SMLSZ+1)*NRHS*3, N*(1+NRHS) + 2*NRHS ). + + IWORK (workspace) INTEGER array. + The dimension must be at least 3 * N + + INFO (output) INTEGER + = 0: successful exit. + < 0: if INFO = -i, the i-th argument had an illegal value. + + Further Details + =============== + + Based on contributions by + Ming Gu and Ren-Cang Li, Computer Science Division, University of + California at Berkeley, USA + Osni Marques, LBNL/NERSC, USA + + ===================================================================== + + + Test the input parameters. +*/ + + /* Parameter adjustments */ + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + bx_dim1 = *ldbx; + bx_offset = 1 + bx_dim1; + bx -= bx_offset; + givnum_dim1 = *ldu; + givnum_offset = 1 + givnum_dim1; + givnum -= givnum_offset; + poles_dim1 = *ldu; + poles_offset = 1 + poles_dim1; + poles -= poles_offset; + z_dim1 = *ldu; + z_offset = 1 + z_dim1; + z__ -= z_offset; + difr_dim1 = *ldu; + difr_offset = 1 + difr_dim1; + difr -= difr_offset; + difl_dim1 = *ldu; + difl_offset = 1 + difl_dim1; + difl -= difl_offset; + vt_dim1 = *ldu; + vt_offset = 1 + vt_dim1; + vt -= vt_offset; + u_dim1 = *ldu; + u_offset = 1 + u_dim1; + u -= u_offset; + --k; + --givptr; + perm_dim1 = *ldgcol; + perm_offset = 1 + perm_dim1; + perm -= perm_offset; + givcol_dim1 = *ldgcol; + givcol_offset = 1 + givcol_dim1; + givcol -= givcol_offset; + --c__; + --s; + --rwork; + --iwork; + + /* Function Body */ + *info = 0; + + if (*icompq < 0 || *icompq > 1) { + *info = -1; + } else if (*smlsiz < 3) { + *info = -2; + } else if (*n < *smlsiz) { + *info = -3; + } else if (*nrhs < 1) { + *info = -4; + } else if (*ldb < *n) { + *info = -6; + } else if (*ldbx < *n) { + *info = -8; + } else if (*ldu < *n) { + *info = -10; + } else if (*ldgcol < *n) { + *info = -19; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CLALSA", &i__1); + return 0; + } + +/* Book-keeping and setting up the computation tree. */ + + inode = 1; + ndiml = inode + *n; + ndimr = ndiml + *n; + + slasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr], + smlsiz); + +/* + The following code applies back the left singular vector factors. + For applying back the right singular vector factors, go to 170. +*/ + + if (*icompq == 1) { + goto L170; + } + +/* + The nodes on the bottom level of the tree were solved + by SLASDQ. The corresponding left and right singular vector + matrices are in explicit form. First apply back the left + singular vector matrices. +*/ + + ndb1 = (nd + 1) / 2; + i__1 = nd; + for (i__ = ndb1; i__ <= i__1; ++i__) { + +/* + IC : center row of each node + NL : number of rows of left subproblem + NR : number of rows of right subproblem + NLF: starting row of the left subproblem + NRF: starting row of the right subproblem +*/ + + i1 = i__ - 1; + ic = iwork[inode + i1]; + nl = iwork[ndiml + i1]; + nr = iwork[ndimr + i1]; + nlf = ic - nl; + nrf = ic + 1; + +/* + Since B and BX are complex, the following call to SGEMM + is performed in two steps (real and imaginary parts). + + CALL SGEMM( 'T', 'N', NL, NRHS, NL, ONE, U( NLF, 1 ), LDU, + $ B( NLF, 1 ), LDB, ZERO, BX( NLF, 1 ), LDBX ) +*/ + + j = nl * *nrhs << 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nlf + nl - 1; + for (jrow = nlf; jrow <= i__3; ++jrow) { + ++j; + i__4 = jrow + jcol * b_dim1; + rwork[j] = b[i__4].r; +/* L10: */ + } +/* L20: */ + } + sgemm_("T", "N", &nl, nrhs, &nl, &c_b1034, &u[nlf + u_dim1], ldu, & + rwork[(nl * *nrhs << 1) + 1], &nl, &c_b328, &rwork[1], &nl); + j = nl * *nrhs << 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nlf + nl - 1; + for (jrow = nlf; jrow <= i__3; ++jrow) { + ++j; + rwork[j] = r_imag(&b[jrow + jcol * b_dim1]); +/* L30: */ + } +/* L40: */ + } + sgemm_("T", "N", &nl, nrhs, &nl, &c_b1034, &u[nlf + u_dim1], ldu, & + rwork[(nl * *nrhs << 1) + 1], &nl, &c_b328, &rwork[nl * *nrhs + + 1], &nl); + jreal = 0; + jimag = nl * *nrhs; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nlf + nl - 1; + for (jrow = nlf; jrow <= i__3; ++jrow) { + ++jreal; + ++jimag; + i__4 = jrow + jcol * bx_dim1; + i__5 = jreal; + i__6 = jimag; + q__1.r = rwork[i__5], q__1.i = rwork[i__6]; + bx[i__4].r = q__1.r, bx[i__4].i = q__1.i; +/* L50: */ + } +/* L60: */ + } + +/* + Since B and BX are complex, the following call to SGEMM + is performed in two steps (real and imaginary parts). + + CALL SGEMM( 'T', 'N', NR, NRHS, NR, ONE, U( NRF, 1 ), LDU, + $ B( NRF, 1 ), LDB, ZERO, BX( NRF, 1 ), LDBX ) +*/ + + j = nr * *nrhs << 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nrf + nr - 1; + for (jrow = nrf; jrow <= i__3; ++jrow) { + ++j; + i__4 = jrow + jcol * b_dim1; + rwork[j] = b[i__4].r; +/* L70: */ + } +/* L80: */ + } + sgemm_("T", "N", &nr, nrhs, &nr, &c_b1034, &u[nrf + u_dim1], ldu, & + rwork[(nr * *nrhs << 1) + 1], &nr, &c_b328, &rwork[1], &nr); + j = nr * *nrhs << 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nrf + nr - 1; + for (jrow = nrf; jrow <= i__3; ++jrow) { + ++j; + rwork[j] = r_imag(&b[jrow + jcol * b_dim1]); +/* L90: */ + } +/* L100: */ + } + sgemm_("T", "N", &nr, nrhs, &nr, &c_b1034, &u[nrf + u_dim1], ldu, & + rwork[(nr * *nrhs << 1) + 1], &nr, &c_b328, &rwork[nr * *nrhs + + 1], &nr); + jreal = 0; + jimag = nr * *nrhs; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nrf + nr - 1; + for (jrow = nrf; jrow <= i__3; ++jrow) { + ++jreal; + ++jimag; + i__4 = jrow + jcol * bx_dim1; + i__5 = jreal; + i__6 = jimag; + q__1.r = rwork[i__5], q__1.i = rwork[i__6]; + bx[i__4].r = q__1.r, bx[i__4].i = q__1.i; +/* L110: */ + } +/* L120: */ + } + +/* L130: */ + } + +/* + Next copy the rows of B that correspond to unchanged rows + in the bidiagonal matrix to BX. +*/ + + i__1 = nd; + for (i__ = 1; i__ <= i__1; ++i__) { + ic = iwork[inode + i__ - 1]; + ccopy_(nrhs, &b[ic + b_dim1], ldb, &bx[ic + bx_dim1], ldbx); +/* L140: */ + } + +/* + Finally go through the left singular vector matrices of all + the other subproblems bottom-up on the tree. +*/ + + j = pow_ii(&c__2, &nlvl); + sqre = 0; + + for (lvl = nlvl; lvl >= 1; --lvl) { + lvl2 = (lvl << 1) - 1; + +/* + find the first node LF and last node LL on + the current level LVL +*/ + + if (lvl == 1) { + lf = 1; + ll = 1; + } else { + i__1 = lvl - 1; + lf = pow_ii(&c__2, &i__1); + ll = (lf << 1) - 1; + } + i__1 = ll; + for (i__ = lf; i__ <= i__1; ++i__) { + im1 = i__ - 1; + ic = iwork[inode + im1]; + nl = iwork[ndiml + im1]; + nr = iwork[ndimr + im1]; + nlf = ic - nl; + nrf = ic + 1; + --j; + clals0_(icompq, &nl, &nr, &sqre, nrhs, &bx[nlf + bx_dim1], ldbx, & + b[nlf + b_dim1], ldb, &perm[nlf + lvl * perm_dim1], & + givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, & + givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 * + poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf + + lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[ + j], &s[j], &rwork[1], info); +/* L150: */ + } +/* L160: */ + } + goto L330; + +/* ICOMPQ = 1: applying back the right singular vector factors. */ + +L170: + +/* + First now go through the right singular vector matrices of all + the tree nodes top-down. +*/ + + j = 0; + i__1 = nlvl; + for (lvl = 1; lvl <= i__1; ++lvl) { + lvl2 = (lvl << 1) - 1; + +/* + Find the first node LF and last node LL on + the current level LVL. +*/ + + if (lvl == 1) { + lf = 1; + ll = 1; + } else { + i__2 = lvl - 1; + lf = pow_ii(&c__2, &i__2); + ll = (lf << 1) - 1; + } + i__2 = lf; + for (i__ = ll; i__ >= i__2; --i__) { + im1 = i__ - 1; + ic = iwork[inode + im1]; + nl = iwork[ndiml + im1]; + nr = iwork[ndimr + im1]; + nlf = ic - nl; + nrf = ic + 1; + if (i__ == ll) { + sqre = 0; + } else { + sqre = 1; + } + ++j; + clals0_(icompq, &nl, &nr, &sqre, nrhs, &b[nlf + b_dim1], ldb, &bx[ + nlf + bx_dim1], ldbx, &perm[nlf + lvl * perm_dim1], & + givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, & + givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 * + poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf + + lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[ + j], &s[j], &rwork[1], info); +/* L180: */ + } +/* L190: */ + } + +/* + The nodes on the bottom level of the tree were solved + by SLASDQ. The corresponding right singular vector + matrices are in explicit form. Apply them back. +*/ + + ndb1 = (nd + 1) / 2; + i__1 = nd; + for (i__ = ndb1; i__ <= i__1; ++i__) { + i1 = i__ - 1; + ic = iwork[inode + i1]; + nl = iwork[ndiml + i1]; + nr = iwork[ndimr + i1]; + nlp1 = nl + 1; + if (i__ == nd) { + nrp1 = nr; + } else { + nrp1 = nr + 1; + } + nlf = ic - nl; + nrf = ic + 1; + +/* + Since B and BX are complex, the following call to SGEMM is + performed in two steps (real and imaginary parts). + + CALL SGEMM( 'T', 'N', NLP1, NRHS, NLP1, ONE, VT( NLF, 1 ), LDU, + $ B( NLF, 1 ), LDB, ZERO, BX( NLF, 1 ), LDBX ) +*/ + + j = nlp1 * *nrhs << 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nlf + nlp1 - 1; + for (jrow = nlf; jrow <= i__3; ++jrow) { + ++j; + i__4 = jrow + jcol * b_dim1; + rwork[j] = b[i__4].r; +/* L200: */ + } +/* L210: */ + } + sgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b1034, &vt[nlf + vt_dim1], + ldu, &rwork[(nlp1 * *nrhs << 1) + 1], &nlp1, &c_b328, &rwork[ + 1], &nlp1); + j = nlp1 * *nrhs << 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nlf + nlp1 - 1; + for (jrow = nlf; jrow <= i__3; ++jrow) { + ++j; + rwork[j] = r_imag(&b[jrow + jcol * b_dim1]); +/* L220: */ + } +/* L230: */ + } + sgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b1034, &vt[nlf + vt_dim1], + ldu, &rwork[(nlp1 * *nrhs << 1) + 1], &nlp1, &c_b328, &rwork[ + nlp1 * *nrhs + 1], &nlp1); + jreal = 0; + jimag = nlp1 * *nrhs; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nlf + nlp1 - 1; + for (jrow = nlf; jrow <= i__3; ++jrow) { + ++jreal; + ++jimag; + i__4 = jrow + jcol * bx_dim1; + i__5 = jreal; + i__6 = jimag; + q__1.r = rwork[i__5], q__1.i = rwork[i__6]; + bx[i__4].r = q__1.r, bx[i__4].i = q__1.i; +/* L240: */ + } +/* L250: */ + } + +/* + Since B and BX are complex, the following call to SGEMM is + performed in two steps (real and imaginary parts). + + CALL SGEMM( 'T', 'N', NRP1, NRHS, NRP1, ONE, VT( NRF, 1 ), LDU, + $ B( NRF, 1 ), LDB, ZERO, BX( NRF, 1 ), LDBX ) +*/ + + j = nrp1 * *nrhs << 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nrf + nrp1 - 1; + for (jrow = nrf; jrow <= i__3; ++jrow) { + ++j; + i__4 = jrow + jcol * b_dim1; + rwork[j] = b[i__4].r; +/* L260: */ + } +/* L270: */ + } + sgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b1034, &vt[nrf + vt_dim1], + ldu, &rwork[(nrp1 * *nrhs << 1) + 1], &nrp1, &c_b328, &rwork[ + 1], &nrp1); + j = nrp1 * *nrhs << 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nrf + nrp1 - 1; + for (jrow = nrf; jrow <= i__3; ++jrow) { + ++j; + rwork[j] = r_imag(&b[jrow + jcol * b_dim1]); +/* L280: */ + } +/* L290: */ + } + sgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b1034, &vt[nrf + vt_dim1], + ldu, &rwork[(nrp1 * *nrhs << 1) + 1], &nrp1, &c_b328, &rwork[ + nrp1 * *nrhs + 1], &nrp1); + jreal = 0; + jimag = nrp1 * *nrhs; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = nrf + nrp1 - 1; + for (jrow = nrf; jrow <= i__3; ++jrow) { + ++jreal; + ++jimag; + i__4 = jrow + jcol * bx_dim1; + i__5 = jreal; + i__6 = jimag; + q__1.r = rwork[i__5], q__1.i = rwork[i__6]; + bx[i__4].r = q__1.r, bx[i__4].i = q__1.i; +/* L300: */ + } +/* L310: */ + } + +/* L320: */ + } + +L330: + + return 0; + +/* End of CLALSA */ + +} /* clalsa_ */ + +/* Subroutine */ int clalsd_(char *uplo, integer *smlsiz, integer *n, integer + *nrhs, real *d__, real *e, complex *b, integer *ldb, real *rcond, + integer *rank, complex *work, real *rwork, integer *iwork, integer * + info) +{ + /* System generated locals */ + integer b_dim1, b_offset, i__1, i__2, i__3, i__4, i__5, i__6; + real r__1; + complex q__1; + + /* Local variables */ + static integer c__, i__, j, k; + static real r__; + static integer s, u, z__; + static real cs; + static integer bx; + static real sn; + static integer st, vt, nm1, st1; + static real eps; + static integer iwk; + static real tol; + static integer difl, difr; + static real rcnd; + static integer jcol, irwb, perm, nsub, nlvl, sqre, bxst, jrow, irwu, + jimag, jreal; + extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *); + static integer irwib; + extern /* Subroutine */ int ccopy_(integer *, complex *, integer *, + complex *, integer *); + static integer poles, sizei, irwrb, nsize; + extern /* Subroutine */ int csrot_(integer *, complex *, integer *, + complex *, integer *, real *, real *); + static integer irwvt, icmpq1, icmpq2; + extern /* Subroutine */ int clalsa_(integer *, integer *, integer *, + integer *, complex *, integer *, complex *, integer *, real *, + integer *, real *, integer *, real *, real *, real *, real *, + integer *, integer *, integer *, integer *, real *, real *, real * + , real *, integer *, integer *), clascl_(char *, integer *, + integer *, real *, real *, integer *, integer *, complex *, + integer *, integer *); + extern doublereal slamch_(char *); + extern /* Subroutine */ int slasda_(integer *, integer *, integer *, + integer *, real *, real *, real *, integer *, real *, integer *, + real *, real *, real *, real *, integer *, integer *, integer *, + integer *, real *, real *, real *, real *, integer *, integer *), + clacpy_(char *, integer *, integer *, complex *, integer *, + complex *, integer *), claset_(char *, integer *, integer + *, complex *, complex *, complex *, integer *), xerbla_( + char *, integer *), slascl_(char *, integer *, integer *, + real *, real *, integer *, integer *, real *, integer *, integer * + ); + extern integer isamax_(integer *, real *, integer *); + static integer givcol; + extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer + *, integer *, integer *, real *, real *, real *, integer *, real * + , integer *, real *, integer *, real *, integer *), + slaset_(char *, integer *, integer *, real *, real *, real *, + integer *), slartg_(real *, real *, real *, real *, real * + ); + static real orgnrm; + static integer givnum; + extern doublereal slanst_(char *, integer *, real *, real *); + extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + static integer givptr, nrwork, irwwrk, smlszp; + + +/* + -- LAPACK routine (version 3.2.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + June 2010 + + + Purpose + ======= + + CLALSD uses the singular value decomposition of A to solve the least + squares problem of finding X to minimize the Euclidean norm of each + column of A*X-B, where A is N-by-N upper bidiagonal, and X and B + are N-by-NRHS. The solution X overwrites B. + + The singular values of A smaller than RCOND times the largest + singular value are treated as zero in solving the least squares + problem; in this case a minimum norm solution is returned. + The actual singular values are returned in D in ascending order. + + This code makes very mild assumptions about floating point + arithmetic. It will work on machines with a guard digit in + add/subtract, or on those binary machines without guard digits + which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. + It could conceivably fail on hexadecimal or decimal machines + without guard digits, but we know of none. + + Arguments + ========= + + UPLO (input) CHARACTER*1 + = 'U': D and E define an upper bidiagonal matrix. + = 'L': D and E define a lower bidiagonal matrix. + + SMLSIZ (input) INTEGER + The maximum size of the subproblems at the bottom of the + computation tree. + + N (input) INTEGER + The dimension of the bidiagonal matrix. N >= 0. + + NRHS (input) INTEGER + The number of columns of B. NRHS must be at least 1. + + D (input/output) REAL array, dimension (N) + On entry D contains the main diagonal of the bidiagonal + matrix. On exit, if INFO = 0, D contains its singular values. + + E (input/output) REAL array, dimension (N-1) + Contains the super-diagonal entries of the bidiagonal matrix. + On exit, E has been destroyed. + + B (input/output) COMPLEX array, dimension (LDB,NRHS) + On input, B contains the right hand sides of the least + squares problem. On output, B contains the solution X. + + LDB (input) INTEGER + The leading dimension of B in the calling subprogram. + LDB must be at least max(1,N). + + RCOND (input) REAL + The singular values of A less than or equal to RCOND times + the largest singular value are treated as zero in solving + the least squares problem. If RCOND is negative, + machine precision is used instead. + For example, if diag(S)*X=B were the least squares problem, + where diag(S) is a diagonal matrix of singular values, the + solution would be X(i) = B(i) / S(i) if S(i) is greater than + RCOND*max(S), and X(i) = 0 if S(i) is less than or equal to + RCOND*max(S). + + RANK (output) INTEGER + The number of singular values of A greater than RCOND times + the largest singular value. + + WORK (workspace) COMPLEX array, dimension (N * NRHS). + + RWORK (workspace) REAL array, dimension at least + (9*N + 2*N*SMLSIZ + 8*N*NLVL + 3*SMLSIZ*NRHS + + MAX( (SMLSIZ+1)**2, N*(1+NRHS) + 2*NRHS ), + where + NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 ) + + IWORK (workspace) INTEGER array, dimension (3*N*NLVL + 11*N). + + INFO (output) INTEGER + = 0: successful exit. + < 0: if INFO = -i, the i-th argument had an illegal value. + > 0: The algorithm failed to compute a singular value while + working on the submatrix lying in rows and columns + INFO/(N+1) through MOD(INFO,N+1). + + Further Details + =============== + + Based on contributions by + Ming Gu and Ren-Cang Li, Computer Science Division, University of + California at Berkeley, USA + Osni Marques, LBNL/NERSC, USA + + ===================================================================== + + + Test the input parameters. +*/ + + /* Parameter adjustments */ + --d__; + --e; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + --work; + --rwork; + --iwork; + + /* Function Body */ + *info = 0; + + if (*n < 0) { + *info = -3; + } else if (*nrhs < 1) { + *info = -4; + } else if (*ldb < 1 || *ldb < *n) { + *info = -8; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("CLALSD", &i__1); + return 0; + } + + eps = slamch_("Epsilon"); + +/* Set up the tolerance. */ + + if (*rcond <= 0.f || *rcond >= 1.f) { + rcnd = eps; + } else { + rcnd = *rcond; + } + + *rank = 0; + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } else if (*n == 1) { + if (d__[1] == 0.f) { + claset_("A", &c__1, nrhs, &c_b56, &c_b56, &b[b_offset], ldb); + } else { + *rank = 1; + clascl_("G", &c__0, &c__0, &d__[1], &c_b1034, &c__1, nrhs, &b[ + b_offset], ldb, info); + d__[1] = dabs(d__[1]); + } + return 0; + } + +/* Rotate the matrix if it is lower bidiagonal. */ + + if (*(unsigned char *)uplo == 'L') { + i__1 = *n - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + slartg_(&d__[i__], &e[i__], &cs, &sn, &r__); + d__[i__] = r__; + e[i__] = sn * d__[i__ + 1]; + d__[i__ + 1] = cs * d__[i__ + 1]; + if (*nrhs == 1) { + csrot_(&c__1, &b[i__ + b_dim1], &c__1, &b[i__ + 1 + b_dim1], & + c__1, &cs, &sn); + } else { + rwork[(i__ << 1) - 1] = cs; + rwork[i__ * 2] = sn; + } +/* L10: */ + } + if (*nrhs > 1) { + i__1 = *nrhs; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = *n - 1; + for (j = 1; j <= i__2; ++j) { + cs = rwork[(j << 1) - 1]; + sn = rwork[j * 2]; + csrot_(&c__1, &b[j + i__ * b_dim1], &c__1, &b[j + 1 + i__ + * b_dim1], &c__1, &cs, &sn); +/* L20: */ + } +/* L30: */ + } + } + } + +/* Scale. */ + + nm1 = *n - 1; + orgnrm = slanst_("M", n, &d__[1], &e[1]); + if (orgnrm == 0.f) { + claset_("A", n, nrhs, &c_b56, &c_b56, &b[b_offset], ldb); + return 0; + } + + slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, &c__1, &d__[1], n, info); + slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &nm1, &c__1, &e[1], &nm1, + info); + +/* + If N is smaller than the minimum divide size SMLSIZ, then solve + the problem with another solver. +*/ + + if (*n <= *smlsiz) { + irwu = 1; + irwvt = irwu + *n * *n; + irwwrk = irwvt + *n * *n; + irwrb = irwwrk; + irwib = irwrb + *n * *nrhs; + irwb = irwib + *n * *nrhs; + slaset_("A", n, n, &c_b328, &c_b1034, &rwork[irwu], n); + slaset_("A", n, n, &c_b328, &c_b1034, &rwork[irwvt], n); + slasdq_("U", &c__0, n, n, n, &c__0, &d__[1], &e[1], &rwork[irwvt], n, + &rwork[irwu], n, &rwork[irwwrk], &c__1, &rwork[irwwrk], info); + if (*info != 0) { + return 0; + } + +/* + In the real version, B is passed to SLASDQ and multiplied + internally by Q'. Here B is complex and that product is + computed below in two steps (real and imaginary parts). +*/ + + j = irwb - 1; + i__1 = *nrhs; + for (jcol = 1; jcol <= i__1; ++jcol) { + i__2 = *n; + for (jrow = 1; jrow <= i__2; ++jrow) { + ++j; + i__3 = jrow + jcol * b_dim1; + rwork[j] = b[i__3].r; +/* L40: */ + } +/* L50: */ + } + sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwu], n, &rwork[irwb], + n, &c_b328, &rwork[irwrb], n); + j = irwb - 1; + i__1 = *nrhs; + for (jcol = 1; jcol <= i__1; ++jcol) { + i__2 = *n; + for (jrow = 1; jrow <= i__2; ++jrow) { + ++j; + rwork[j] = r_imag(&b[jrow + jcol * b_dim1]); +/* L60: */ + } +/* L70: */ + } + sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwu], n, &rwork[irwb], + n, &c_b328, &rwork[irwib], n); + jreal = irwrb - 1; + jimag = irwib - 1; + i__1 = *nrhs; + for (jcol = 1; jcol <= i__1; ++jcol) { + i__2 = *n; + for (jrow = 1; jrow <= i__2; ++jrow) { + ++jreal; + ++jimag; + i__3 = jrow + jcol * b_dim1; + i__4 = jreal; + i__5 = jimag; + q__1.r = rwork[i__4], q__1.i = rwork[i__5]; + b[i__3].r = q__1.r, b[i__3].i = q__1.i; +/* L80: */ + } +/* L90: */ + } + + tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1)); + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (d__[i__] <= tol) { + claset_("A", &c__1, nrhs, &c_b56, &c_b56, &b[i__ + b_dim1], + ldb); + } else { + clascl_("G", &c__0, &c__0, &d__[i__], &c_b1034, &c__1, nrhs, & + b[i__ + b_dim1], ldb, info); + ++(*rank); + } +/* L100: */ + } + +/* + Since B is complex, the following call to SGEMM is performed + in two steps (real and imaginary parts). That is for V * B + (in the real version of the code V' is stored in WORK). + + CALL SGEMM( 'T', 'N', N, NRHS, N, ONE, WORK, N, B, LDB, ZERO, + $ WORK( NWORK ), N ) +*/ + + j = irwb - 1; + i__1 = *nrhs; + for (jcol = 1; jcol <= i__1; ++jcol) { + i__2 = *n; + for (jrow = 1; jrow <= i__2; ++jrow) { + ++j; + i__3 = jrow + jcol * b_dim1; + rwork[j] = b[i__3].r; +/* L110: */ + } +/* L120: */ + } + sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwvt], n, &rwork[irwb], + n, &c_b328, &rwork[irwrb], n); + j = irwb - 1; + i__1 = *nrhs; + for (jcol = 1; jcol <= i__1; ++jcol) { + i__2 = *n; + for (jrow = 1; jrow <= i__2; ++jrow) { + ++j; + rwork[j] = r_imag(&b[jrow + jcol * b_dim1]); +/* L130: */ + } +/* L140: */ + } + sgemm_("T", "N", n, nrhs, n, &c_b1034, &rwork[irwvt], n, &rwork[irwb], + n, &c_b328, &rwork[irwib], n); + jreal = irwrb - 1; + jimag = irwib - 1; + i__1 = *nrhs; + for (jcol = 1; jcol <= i__1; ++jcol) { + i__2 = *n; + for (jrow = 1; jrow <= i__2; ++jrow) { + ++jreal; + ++jimag; + i__3 = jrow + jcol * b_dim1; + i__4 = jreal; + i__5 = jimag; + q__1.r = rwork[i__4], q__1.i = rwork[i__5]; + b[i__3].r = q__1.r, b[i__3].i = q__1.i; +/* L150: */ + } +/* L160: */ + } + +/* Unscale. */ + + slascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, n, &c__1, &d__[1], n, + info); + slasrt_("D", n, &d__[1], info); + clascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, nrhs, &b[b_offset], + ldb, info); + + return 0; + } + +/* Book-keeping and setting up some constants. */ + + nlvl = (integer) (log((real) (*n) / (real) (*smlsiz + 1)) / log(2.f)) + 1; + + smlszp = *smlsiz + 1; + + u = 1; + vt = *smlsiz * *n + 1; + difl = vt + smlszp * *n; + difr = difl + nlvl * *n; + z__ = difr + (nlvl * *n << 1); + c__ = z__ + nlvl * *n; + s = c__ + *n; + poles = s + *n; + givnum = poles + (nlvl << 1) * *n; + nrwork = givnum + (nlvl << 1) * *n; + bx = 1; + + irwrb = nrwork; + irwib = irwrb + *smlsiz * *nrhs; + irwb = irwib + *smlsiz * *nrhs; + + sizei = *n + 1; + k = sizei + *n; + givptr = k + *n; + perm = givptr + *n; + givcol = perm + nlvl * *n; + iwk = givcol + (nlvl * *n << 1); + + st = 1; + sqre = 0; + icmpq1 = 1; + icmpq2 = 0; + nsub = 0; + + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if ((r__1 = d__[i__], dabs(r__1)) < eps) { + d__[i__] = r_sign(&eps, &d__[i__]); + } +/* L170: */ + } + + i__1 = nm1; + for (i__ = 1; i__ <= i__1; ++i__) { + if ((r__1 = e[i__], dabs(r__1)) < eps || i__ == nm1) { + ++nsub; + iwork[nsub] = st; + +/* + Subproblem found. First determine its size and then + apply divide and conquer on it. +*/ + + if (i__ < nm1) { + +/* A subproblem with E(I) small for I < NM1. */ + + nsize = i__ - st + 1; + iwork[sizei + nsub - 1] = nsize; + } else if ((r__1 = e[i__], dabs(r__1)) >= eps) { + +/* A subproblem with E(NM1) not too small but I = NM1. */ + + nsize = *n - st + 1; + iwork[sizei + nsub - 1] = nsize; + } else { + +/* + A subproblem with E(NM1) small. This implies an + 1-by-1 subproblem at D(N), which is not solved + explicitly. +*/ + + nsize = i__ - st + 1; + iwork[sizei + nsub - 1] = nsize; + ++nsub; + iwork[nsub] = *n; + iwork[sizei + nsub - 1] = 1; + ccopy_(nrhs, &b[*n + b_dim1], ldb, &work[bx + nm1], n); + } + st1 = st - 1; + if (nsize == 1) { + +/* + This is a 1-by-1 subproblem and is not solved + explicitly. +*/ + + ccopy_(nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n); + } else if (nsize <= *smlsiz) { + +/* This is a small subproblem and is solved by SLASDQ. */ + + slaset_("A", &nsize, &nsize, &c_b328, &c_b1034, &rwork[vt + + st1], n); + slaset_("A", &nsize, &nsize, &c_b328, &c_b1034, &rwork[u + + st1], n); + slasdq_("U", &c__0, &nsize, &nsize, &nsize, &c__0, &d__[st], & + e[st], &rwork[vt + st1], n, &rwork[u + st1], n, & + rwork[nrwork], &c__1, &rwork[nrwork], info) + ; + if (*info != 0) { + return 0; + } + +/* + In the real version, B is passed to SLASDQ and multiplied + internally by Q'. Here B is complex and that product is + computed below in two steps (real and imaginary parts). +*/ + + j = irwb - 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = st + nsize - 1; + for (jrow = st; jrow <= i__3; ++jrow) { + ++j; + i__4 = jrow + jcol * b_dim1; + rwork[j] = b[i__4].r; +/* L180: */ + } +/* L190: */ + } + sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[u + + st1], n, &rwork[irwb], &nsize, &c_b328, &rwork[irwrb], + &nsize); + j = irwb - 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = st + nsize - 1; + for (jrow = st; jrow <= i__3; ++jrow) { + ++j; + rwork[j] = r_imag(&b[jrow + jcol * b_dim1]); +/* L200: */ + } +/* L210: */ + } + sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[u + + st1], n, &rwork[irwb], &nsize, &c_b328, &rwork[irwib], + &nsize); + jreal = irwrb - 1; + jimag = irwib - 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = st + nsize - 1; + for (jrow = st; jrow <= i__3; ++jrow) { + ++jreal; + ++jimag; + i__4 = jrow + jcol * b_dim1; + i__5 = jreal; + i__6 = jimag; + q__1.r = rwork[i__5], q__1.i = rwork[i__6]; + b[i__4].r = q__1.r, b[i__4].i = q__1.i; +/* L220: */ + } +/* L230: */ + } + + clacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx + + st1], n); + } else { + +/* A large problem. Solve it using divide and conquer. */ + + slasda_(&icmpq1, smlsiz, &nsize, &sqre, &d__[st], &e[st], & + rwork[u + st1], n, &rwork[vt + st1], &iwork[k + st1], + &rwork[difl + st1], &rwork[difr + st1], &rwork[z__ + + st1], &rwork[poles + st1], &iwork[givptr + st1], & + iwork[givcol + st1], n, &iwork[perm + st1], &rwork[ + givnum + st1], &rwork[c__ + st1], &rwork[s + st1], & + rwork[nrwork], &iwork[iwk], info); + if (*info != 0) { + return 0; + } + bxst = bx + st1; + clalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, & + work[bxst], n, &rwork[u + st1], n, &rwork[vt + st1], & + iwork[k + st1], &rwork[difl + st1], &rwork[difr + st1] + , &rwork[z__ + st1], &rwork[poles + st1], &iwork[ + givptr + st1], &iwork[givcol + st1], n, &iwork[perm + + st1], &rwork[givnum + st1], &rwork[c__ + st1], &rwork[ + s + st1], &rwork[nrwork], &iwork[iwk], info); + if (*info != 0) { + return 0; + } + } + st = i__ + 1; + } +/* L240: */ + } + +/* Apply the singular values and treat the tiny ones as zero. */ + + tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1)); + + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* + Some of the elements in D can be negative because 1-by-1 + subproblems were not solved explicitly. +*/ + + if ((r__1 = d__[i__], dabs(r__1)) <= tol) { + claset_("A", &c__1, nrhs, &c_b56, &c_b56, &work[bx + i__ - 1], n); + } else { + ++(*rank); + clascl_("G", &c__0, &c__0, &d__[i__], &c_b1034, &c__1, nrhs, & + work[bx + i__ - 1], n, info); + } + d__[i__] = (r__1 = d__[i__], dabs(r__1)); +/* L250: */ + } + +/* Now apply back the right singular vectors. */ + + icmpq2 = 1; + i__1 = nsub; + for (i__ = 1; i__ <= i__1; ++i__) { + st = iwork[i__]; + st1 = st - 1; + nsize = iwork[sizei + i__ - 1]; + bxst = bx + st1; + if (nsize == 1) { + ccopy_(nrhs, &work[bxst], n, &b[st + b_dim1], ldb); + } else if (nsize <= *smlsiz) { + +/* + Since B and BX are complex, the following call to SGEMM + is performed in two steps (real and imaginary parts). + + CALL SGEMM( 'T', 'N', NSIZE, NRHS, NSIZE, ONE, + $ RWORK( VT+ST1 ), N, RWORK( BXST ), N, ZERO, + $ B( ST, 1 ), LDB ) +*/ + + j = bxst - *n - 1; + jreal = irwb - 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + j += *n; + i__3 = nsize; + for (jrow = 1; jrow <= i__3; ++jrow) { + ++jreal; + i__4 = j + jrow; + rwork[jreal] = work[i__4].r; +/* L260: */ + } +/* L270: */ + } + sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[vt + st1], + n, &rwork[irwb], &nsize, &c_b328, &rwork[irwrb], &nsize); + j = bxst - *n - 1; + jimag = irwb - 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + j += *n; + i__3 = nsize; + for (jrow = 1; jrow <= i__3; ++jrow) { + ++jimag; + rwork[jimag] = r_imag(&work[j + jrow]); +/* L280: */ + } +/* L290: */ + } + sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b1034, &rwork[vt + st1], + n, &rwork[irwb], &nsize, &c_b328, &rwork[irwib], &nsize); + jreal = irwrb - 1; + jimag = irwib - 1; + i__2 = *nrhs; + for (jcol = 1; jcol <= i__2; ++jcol) { + i__3 = st + nsize - 1; + for (jrow = st; jrow <= i__3; ++jrow) { + ++jreal; + ++jimag; + i__4 = jrow + jcol * b_dim1; + i__5 = jreal; + i__6 = jimag; + q__1.r = rwork[i__5], q__1.i = rwork[i__6]; + b[i__4].r = q__1.r, b[i__4].i = q__1.i; +/* L300: */ + } +/* L310: */ + } + } else { + clalsa_(&icmpq2, smlsiz, &nsize, nrhs, &work[bxst], n, &b[st + + b_dim1], ldb, &rwork[u + st1], n, &rwork[vt + st1], & + iwork[k + st1], &rwork[difl + st1], &rwork[difr + st1], & + rwork[z__ + st1], &rwork[poles + st1], &iwork[givptr + + st1], &iwork[givcol + st1], n, &iwork[perm + st1], &rwork[ + givnum + st1], &rwork[c__ + st1], &rwork[s + st1], &rwork[ + nrwork], &iwork[iwk], info); + if (*info != 0) { + return 0; + } + } +/* L320: */ + } + +/* Unscale and sort the singular values. */ + + slascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, n, &c__1, &d__[1], n, info); + slasrt_("D", n, &d__[1], info); + clascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, n, nrhs, &b[b_offset], ldb, + info); + + return 0; + +/* End of CLALSD */ + +} /* clalsd_ */ + doublereal clange_(char *norm, integer *m, integer *n, complex *a, integer * lda, real *work) { @@ -15487,8 +18145,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * } l = *m * *n + 1; - sgemm_("N", "N", m, n, m, &c_b894, &a[a_offset], lda, &rwork[1], m, & - c_b1087, &rwork[l], m); + sgemm_("N", "N", m, n, m, &c_b1034, &a[a_offset], lda, &rwork[1], m, & + c_b328, &rwork[l], m); i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = *m; @@ -15510,8 +18168,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * } /* L60: */ } - sgemm_("N", "N", m, n, m, &c_b894, &a[a_offset], lda, &rwork[1], m, & - c_b1087, &rwork[l], m); + sgemm_("N", "N", m, n, m, &c_b1034, &a[a_offset], lda, &rwork[1], m, & + c_b328, &rwork[l], m); i__1 = *n; for (j = 1; j <= i__1; ++j) { i__2 = *m; @@ -19708,7 +22366,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * /* Scale x by 1/2. */ - csscal_(n, &c_b2023, &x[1], &c__1); + csscal_(n, &c_b2435, &x[1], &c__1); *scale *= .5f; } @@ -20515,8 +23173,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * a[i__ + (i__ + ib) * a_dim1], lda, &c_b57, &a[i__ * a_dim1 + 1], lda); i__3 = *n - i__ - ib + 1; - cherk_("Upper", "No transpose", &ib, &i__3, &c_b894, &a[ - i__ + (i__ + ib) * a_dim1], lda, &c_b894, &a[i__ + cherk_("Upper", "No transpose", &ib, &i__3, &c_b1034, &a[ + i__ + (i__ + ib) * a_dim1], lda, &c_b1034, &a[i__ + i__ * a_dim1], lda); } /* L10: */ @@ -20545,8 +23203,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * a_dim1], lda); i__3 = *n - i__ - ib + 1; cherk_("Lower", "Conjugate transpose", &ib, &i__3, & - c_b894, &a[i__ + ib + i__ * a_dim1], lda, &c_b894, - &a[i__ + i__ * a_dim1], lda); + c_b1034, &a[i__ + ib + i__ * a_dim1], lda, & + c_b1034, &a[i__ + i__ * a_dim1], lda); } /* L20: */ } @@ -20910,8 +23568,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * i__3 = nb, i__4 = *n - j + 1; jb = min(i__3,i__4); i__3 = j - 1; - cherk_("Upper", "Conjugate transpose", &jb, &i__3, &c_b1136, & - a[j * a_dim1 + 1], lda, &c_b894, &a[j + j * a_dim1], + cherk_("Upper", "Conjugate transpose", &jb, &i__3, &c_b1276, & + a[j * a_dim1 + 1], lda, &c_b1034, &a[j + j * a_dim1], lda); cpotf2_("Upper", &jb, &a[j + j * a_dim1], lda, info); if (*info != 0) { @@ -20953,8 +23611,8 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * i__3 = nb, i__4 = *n - j + 1; jb = min(i__3,i__4); i__3 = j - 1; - cherk_("Lower", "No transpose", &jb, &i__3, &c_b1136, &a[j + - a_dim1], lda, &c_b894, &a[j + j * a_dim1], lda); + cherk_("Lower", "No transpose", &jb, &i__3, &c_b1276, &a[j + + a_dim1], lda, &c_b1034, &a[j + j * a_dim1], lda); cpotf2_("Lower", &jb, &a[j + j * a_dim1], lda, info); if (*info != 0) { goto L30; @@ -21681,7 +24339,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * /* If COMPZ = 'I', we simply call SSTEDC instead. */ if (icompz == 2) { - slaset_("Full", n, n, &c_b1087, &c_b894, &rwork[1], n); + slaset_("Full", n, n, &c_b328, &c_b1034, &rwork[1], n); ll = *n * *n + 1; i__1 = *lrwork - ll + 1; sstedc_("I", n, &d__[1], &e[1], &rwork[1], n, &rwork[ll], &i__1, & @@ -21748,12 +24406,12 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * /* Scale. */ orgnrm = slanst_("M", &m, &d__[start], &e[start]); - slascl_("G", &c__0, &c__0, &orgnrm, &c_b894, &m, &c__1, &d__[ + slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &m, &c__1, &d__[ start], &m, info); i__1 = m - 1; i__2 = m - 1; - slascl_("G", &c__0, &c__0, &orgnrm, &c_b894, &i__1, &c__1, &e[ - start], &i__2, info); + slascl_("G", &c__0, &c__0, &orgnrm, &c_b1034, &i__1, &c__1, & + e[start], &i__2, info); claed0_(n, &m, &d__[start], &e[start], &z__[start * z_dim1 + 1], ldz, &work[1], n, &rwork[1], &iwork[1], info); @@ -21765,7 +24423,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * /* Scale back. */ - slascl_("G", &c__0, &c__0, &c_b894, &orgnrm, &m, &c__1, &d__[ + slascl_("G", &c__0, &c__0, &c_b1034, &orgnrm, &m, &c__1, &d__[ start], &m, info); } else { @@ -22166,7 +24824,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * /* Form shift. */ g = (d__[l + 1] - p) / (e[l] * 2.f); - r__ = slapy2_(&g, &c_b894); + r__ = slapy2_(&g, &c_b1034); g = d__[m] - p + e[l] / (g + r_sign(&r__, &g)); s = 1.f; @@ -22292,7 +24950,7 @@ doublereal clanhe_(char *norm, char *uplo, integer *n, complex *a, integer * /* Form shift. */ g = (d__[l - 1] - p) / (e[l - 1] * 2.f); - r__ = slapy2_(&g, &c_b894); + r__ = slapy2_(&g, &c_b1034); g = d__[m] - p + e[l - 1] / (g + r_sign(&r__, &g)); s = 1.f; diff --git a/numpy/linalg/lapack_lite/f2c_s_lapack.c b/numpy/linalg/lapack_lite/f2c_s_lapack.c index 04080f81da97..fccb1f58b553 100644 --- a/numpy/linalg/lapack_lite/f2c_s_lapack.c +++ b/numpy/linalg/lapack_lite/f2c_s_lapack.c @@ -40,6 +40,7 @@ static integer c_n1 = -1; static integer c__3 = 3; static integer c__2 = 2; static integer c__65 = 65; +static integer c__6 = 6; static integer c__12 = 12; static integer c__49 = 49; static integer c__4 = 4; @@ -49,7 +50,7 @@ static integer c__15 = 15; static integer c__14 = 14; static integer c__16 = 16; static logical c_true = TRUE_; -static real c_b2863 = 2.f; +static real c_b3178 = 2.f; /* Subroutine */ int sbdsdc_(char *uplo, char *compq, integer *n, real *d__, real *e, real *u, integer *ldu, real *vt, integer *ldvt, real *q, @@ -4025,189 +4026,754 @@ static real c_b2863 = 2.f; } /* sgelqf_ */ -/* Subroutine */ int sgeqr2_(integer *m, integer *n, real *a, integer *lda, - real *tau, real *work, integer *info) +/* Subroutine */ int sgelsd_(integer *m, integer *n, integer *nrhs, real *a, + integer *lda, real *b, integer *ldb, real *s, real *rcond, integer * + rank, real *work, integer *lwork, integer *iwork, integer *info) { /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3; + integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2, i__3, i__4; /* Local variables */ - static integer i__, k; - static real aii; - extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, - integer *, real *, real *, integer *, real *), xerbla_( - char *, integer *), slarfg_(integer *, real *, real *, - integer *, real *); + static integer ie, il, mm; + static real eps, anrm, bnrm; + static integer itau, nlvl, iascl, ibscl; + static real sfmin; + static integer minmn, maxmn, itaup, itauq, mnthr, nwork; + extern /* Subroutine */ int slabad_(real *, real *), sgebrd_(integer *, + integer *, real *, integer *, real *, real *, real *, real *, + real *, integer *, integer *); + extern doublereal slamch_(char *), slange_(char *, integer *, + integer *, real *, integer *, real *); + extern /* Subroutine */ int xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + static real bignum; + extern /* Subroutine */ int sgelqf_(integer *, integer *, real *, integer + *, real *, real *, integer *, integer *), slalsd_(char *, integer + *, integer *, integer *, real *, real *, real *, integer *, real * + , integer *, real *, integer *, integer *), slascl_(char * + , integer *, integer *, real *, real *, integer *, integer *, + real *, integer *, integer *); + static integer wlalsd; + extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer + *, real *, real *, integer *, integer *), slacpy_(char *, integer + *, integer *, real *, integer *, real *, integer *), + slaset_(char *, integer *, integer *, real *, real *, real *, + integer *); + static integer ldwork; + extern /* Subroutine */ int sormbr_(char *, char *, char *, integer *, + integer *, integer *, real *, integer *, real *, real *, integer * + , real *, integer *, integer *); + static integer liwork, minwrk, maxwrk; + static real smlnum; + extern /* Subroutine */ int sormlq_(char *, char *, integer *, integer *, + integer *, real *, integer *, real *, real *, integer *, real *, + integer *, integer *); + static logical lquery; + static integer smlsiz; + extern /* Subroutine */ int sormqr_(char *, char *, integer *, integer *, + integer *, real *, integer *, real *, real *, integer *, real *, + integer *, integer *); /* - -- LAPACK routine (version 3.2.2) -- + -- LAPACK driver routine (version 3.2) -- -- LAPACK is a software package provided by Univ. of Tennessee, -- -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - June 2010 + November 2006 Purpose ======= - SGEQR2 computes a QR factorization of a real m by n matrix A: - A = Q * R. + SGELSD computes the minimum-norm solution to a real linear least + squares problem: + minimize 2-norm(| b - A*x |) + using the singular value decomposition (SVD) of A. A is an M-by-N + matrix which may be rank-deficient. + + Several right hand side vectors b and solution vectors x can be + handled in a single call; they are stored as the columns of the + M-by-NRHS right hand side matrix B and the N-by-NRHS solution + matrix X. + + The problem is solved in three steps: + (1) Reduce the coefficient matrix A to bidiagonal form with + Householder transformations, reducing the original problem + into a "bidiagonal least squares problem" (BLS) + (2) Solve the BLS using a divide and conquer approach. + (3) Apply back all the Householder tranformations to solve + the original least squares problem. + + The effective rank of A is determined by treating as zero those + singular values which are less than RCOND times the largest singular + value. + + The divide and conquer algorithm makes very mild assumptions about + floating point arithmetic. It will work on machines with a guard + digit in add/subtract, or on those binary machines without guard + digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or + Cray-2. It could conceivably fail on hexadecimal or decimal machines + without guard digits, but we know of none. Arguments ========= M (input) INTEGER - The number of rows of the matrix A. M >= 0. + The number of rows of A. M >= 0. N (input) INTEGER - The number of columns of the matrix A. N >= 0. + The number of columns of A. N >= 0. - A (input/output) REAL array, dimension (LDA,N) - On entry, the m by n matrix A. - On exit, the elements on and above the diagonal of the array - contain the min(m,n) by n upper trapezoidal matrix R (R is - upper triangular if m >= n); the elements below the diagonal, - with the array TAU, represent the orthogonal matrix Q as a - product of elementary reflectors (see Further Details). + NRHS (input) INTEGER + The number of right hand sides, i.e., the number of columns + of the matrices B and X. NRHS >= 0. + + A (input) REAL array, dimension (LDA,N) + On entry, the M-by-N matrix A. + On exit, A has been destroyed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). - TAU (output) REAL array, dimension (min(M,N)) - The scalar factors of the elementary reflectors (see Further - Details). + B (input/output) REAL array, dimension (LDB,NRHS) + On entry, the M-by-NRHS right hand side matrix B. + On exit, B is overwritten by the N-by-NRHS solution + matrix X. If m >= n and RANK = n, the residual + sum-of-squares for the solution in the i-th column is given + by the sum of squares of elements n+1:m in that column. - WORK (workspace) REAL array, dimension (N) + LDB (input) INTEGER + The leading dimension of the array B. LDB >= max(1,max(M,N)). - INFO (output) INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value + S (output) REAL array, dimension (min(M,N)) + The singular values of A in decreasing order. + The condition number of A in the 2-norm = S(1)/S(min(m,n)). - Further Details - =============== + RCOND (input) REAL + RCOND is used to determine the effective rank of A. + Singular values S(i) <= RCOND*S(1) are treated as zero. + If RCOND < 0, machine precision is used instead. - The matrix Q is represented as a product of elementary reflectors + RANK (output) INTEGER + The effective rank of A, i.e., the number of singular values + which are greater than RCOND*S(1). - Q = H(1) H(2) . . . H(k), where k = min(m,n). + WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) + On exit, if INFO = 0, WORK(1) returns the optimal LWORK. - Each H(i) has the form + LWORK (input) INTEGER + The dimension of the array WORK. LWORK must be at least 1. + The exact minimum amount of workspace needed depends on M, + N and NRHS. As long as LWORK is at least + 12*N + 2*N*SMLSIZ + 8*N*NLVL + N*NRHS + (SMLSIZ+1)**2, + if M is greater than or equal to N or + 12*M + 2*M*SMLSIZ + 8*M*NLVL + M*NRHS + (SMLSIZ+1)**2, + if M is less than N, the code will execute correctly. + SMLSIZ is returned by ILAENV and is equal to the maximum + size of the subproblems at the bottom of the computation + tree (usually about 25), and + NLVL = MAX( 0, INT( LOG_2( MIN( M,N )/(SMLSIZ+1) ) ) + 1 ) + For good performance, LWORK should generally be larger. - H(i) = I - tau * v * v' + If LWORK = -1, then a workspace query is assumed; the routine + only calculates the optimal size of the array WORK and the + minimum size of the array IWORK, and returns these values as + the first entries of the WORK and IWORK arrays, and no error + message related to LWORK is issued by XERBLA. - where tau is a real scalar, and v is a real vector with - v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), - and tau in TAU(i). + IWORK (workspace) INTEGER array, dimension (MAX(1,LIWORK)) + LIWORK >= max(1, 3*MINMN*NLVL + 11*MINMN), + where MINMN = MIN( M,N ). + On exit, if INFO = 0, IWORK(1) returns the minimum LIWORK. + + INFO (output) INTEGER + = 0: successful exit + < 0: if INFO = -i, the i-th argument had an illegal value. + > 0: the algorithm for computing the SVD failed to converge; + if INFO = i, i off-diagonal elements of an intermediate + bidiagonal form did not converge to zero. + + Further Details + =============== + + Based on contributions by + Ming Gu and Ren-Cang Li, Computer Science Division, University of + California at Berkeley, USA + Osni Marques, LBNL/NERSC, USA ===================================================================== - Test the input arguments + Test the input arguments. */ /* Parameter adjustments */ a_dim1 = *lda; a_offset = 1 + a_dim1; a -= a_offset; - --tau; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + --s; --work; + --iwork; /* Function Body */ *info = 0; + minmn = min(*m,*n); + maxmn = max(*m,*n); + lquery = *lwork == -1; if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; + } else if (*nrhs < 0) { + *info = -3; } else if (*lda < max(1,*m)) { - *info = -4; - } - if (*info != 0) { - i__1 = -(*info); - xerbla_("SGEQR2", &i__1); - return 0; + *info = -5; + } else if (*ldb < max(1,maxmn)) { + *info = -7; } - k = min(*m,*n); +/* + Compute workspace. + (Note: Comments in the code beginning "Workspace:" describe the + minimal amount of workspace needed at that point in the code, + as well as the preferred amount for good performance. + NB refers to the optimal block size for the immediately + following subroutine, as returned by ILAENV.) +*/ - i__1 = k; - for (i__ = 1; i__ <= i__1; ++i__) { + if (*info == 0) { + minwrk = 1; + maxwrk = 1; + liwork = 1; + if (minmn > 0) { + smlsiz = ilaenv_(&c__9, "SGELSD", " ", &c__0, &c__0, &c__0, &c__0, + (ftnlen)6, (ftnlen)1); + mnthr = ilaenv_(&c__6, "SGELSD", " ", m, n, nrhs, &c_n1, (ftnlen) + 6, (ftnlen)1); +/* Computing MAX */ + i__1 = (integer) (log((real) minmn / (real) (smlsiz + 1)) / log( + 2.f)) + 1; + nlvl = max(i__1,0); + liwork = minmn * 3 * nlvl + minmn * 11; + mm = *m; + if (*m >= *n && *m >= mnthr) { -/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */ +/* + Path 1a - overdetermined, with many more rows than + columns. +*/ - i__2 = *m - i__ + 1; -/* Computing MIN */ - i__3 = i__ + 1; - slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1] - , &c__1, &tau[i__]); - if (i__ < *n) { + mm = *n; +/* Computing MAX */ + i__1 = maxwrk, i__2 = *n + *n * ilaenv_(&c__1, "SGEQRF", + " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *n + *nrhs * ilaenv_(&c__1, "SORMQR", + "LT", m, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)2); + maxwrk = max(i__1,i__2); + } + if (*m >= *n) { -/* Apply H(i) to A(i:m,i+1:n) from the left */ +/* + Path 1 - overdetermined or exactly determined. - aii = a[i__ + i__ * a_dim1]; - a[i__ + i__ * a_dim1] = 1.f; - i__2 = *m - i__ + 1; - i__3 = *n - i__; - slarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &tau[ - i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]); - a[i__ + i__ * a_dim1] = aii; + Computing MAX +*/ + i__1 = maxwrk, i__2 = *n * 3 + (mm + *n) * ilaenv_(&c__1, + "SGEBRD", " ", &mm, n, &c_n1, &c_n1, (ftnlen)6, ( + ftnlen)1); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *n * 3 + *nrhs * ilaenv_(&c__1, "SORMBR" + , "QLT", &mm, nrhs, n, &c_n1, (ftnlen)6, (ftnlen)3); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *n * 3 + (*n - 1) * ilaenv_(&c__1, + "SORMBR", "PLN", n, nrhs, n, &c_n1, (ftnlen)6, ( + ftnlen)3); + maxwrk = max(i__1,i__2); +/* Computing 2nd power */ + i__1 = smlsiz + 1; + wlalsd = *n * 9 + (*n << 1) * smlsiz + (*n << 3) * nlvl + *n * + *nrhs + i__1 * i__1; +/* Computing MAX */ + i__1 = maxwrk, i__2 = *n * 3 + wlalsd; + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = *n * 3 + mm, i__2 = *n * 3 + *nrhs, i__1 = max(i__1, + i__2), i__2 = *n * 3 + wlalsd; + minwrk = max(i__1,i__2); + } + if (*n > *m) { +/* Computing 2nd power */ + i__1 = smlsiz + 1; + wlalsd = *m * 9 + (*m << 1) * smlsiz + (*m << 3) * nlvl + *m * + *nrhs + i__1 * i__1; + if (*n >= mnthr) { + +/* + Path 2a - underdetermined, with many more columns + than rows. +*/ + + maxwrk = *m + *m * ilaenv_(&c__1, "SGELQF", " ", m, n, & + c_n1, &c_n1, (ftnlen)6, (ftnlen)1); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m << 1) * + ilaenv_(&c__1, "SGEBRD", " ", m, m, &c_n1, &c_n1, + (ftnlen)6, (ftnlen)1); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + *nrhs * + ilaenv_(&c__1, "SORMBR", "QLT", m, nrhs, m, &c_n1, + (ftnlen)6, (ftnlen)3); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + (*m - 1) * + ilaenv_(&c__1, "SORMBR", "PLN", m, nrhs, m, &c_n1, + (ftnlen)6, (ftnlen)3); + maxwrk = max(i__1,i__2); + if (*nrhs > 1) { +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + *m + *m * *nrhs; + maxwrk = max(i__1,i__2); + } else { +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 1); + maxwrk = max(i__1,i__2); + } +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m + *nrhs * ilaenv_(&c__1, "SORMLQ" + , "LT", n, nrhs, m, &c_n1, (ftnlen)6, (ftnlen)2); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * *m + (*m << 2) + wlalsd; + maxwrk = max(i__1,i__2); +/* + XXX: Ensure the Path 2a case below is triggered. The workspace + calculation should use queries for all routines eventually. + Computing MAX + Computing MAX +*/ + i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), + i__3 = max(i__3,*nrhs), i__4 = *n - *m * 3; + i__1 = maxwrk, i__2 = (*m << 2) + *m * *m + max(i__3,i__4) + ; + maxwrk = max(i__1,i__2); + } else { + +/* Path 2 - remaining underdetermined cases. */ + + maxwrk = *m * 3 + (*n + *m) * ilaenv_(&c__1, "SGEBRD", + " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen)1); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * 3 + *nrhs * ilaenv_(&c__1, + "SORMBR", "QLT", m, nrhs, n, &c_n1, (ftnlen)6, ( + ftnlen)3); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * 3 + *m * ilaenv_(&c__1, "SORM" + "BR", "PLN", n, nrhs, m, &c_n1, (ftnlen)6, (ftnlen) + 3); + maxwrk = max(i__1,i__2); +/* Computing MAX */ + i__1 = maxwrk, i__2 = *m * 3 + wlalsd; + maxwrk = max(i__1,i__2); + } +/* Computing MAX */ + i__1 = *m * 3 + *nrhs, i__2 = *m * 3 + *m, i__1 = max(i__1, + i__2), i__2 = *m * 3 + wlalsd; + minwrk = max(i__1,i__2); + } + } + minwrk = min(minwrk,maxwrk); + work[1] = (real) maxwrk; + iwork[1] = liwork; + + if (*lwork < minwrk && ! lquery) { + *info = -12; } -/* L10: */ } - return 0; -/* End of SGEQR2 */ + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGELSD", &i__1); + return 0; + } else if (lquery) { + return 0; + } -} /* sgeqr2_ */ +/* Quick return if possible. */ -/* Subroutine */ int sgeqrf_(integer *m, integer *n, real *a, integer *lda, - real *tau, real *work, integer *lwork, integer *info) -{ - /* System generated locals */ - integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + if (*m == 0 || *n == 0) { + *rank = 0; + return 0; + } - /* Local variables */ - static integer i__, k, ib, nb, nx, iws, nbmin, iinfo; - extern /* Subroutine */ int sgeqr2_(integer *, integer *, real *, integer - *, real *, real *, integer *), slarfb_(char *, char *, char *, - char *, integer *, integer *, integer *, real *, integer *, real * - , integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *); - extern integer ilaenv_(integer *, char *, char *, integer *, integer *, - integer *, integer *, ftnlen, ftnlen); - extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, - real *, integer *, real *, real *, integer *); - static integer ldwork, lwkopt; - static logical lquery; +/* Get machine parameters. */ + eps = slamch_("P"); + sfmin = slamch_("S"); + smlnum = sfmin / eps; + bignum = 1.f / smlnum; + slabad_(&smlnum, &bignum); -/* - -- LAPACK routine (version 3.2) -- - -- LAPACK is a software package provided by Univ. of Tennessee, -- - -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- - November 2006 +/* Scale A if max entry outside range [SMLNUM,BIGNUM]. */ + anrm = slange_("M", m, n, &a[a_offset], lda, &work[1]); + iascl = 0; + if (anrm > 0.f && anrm < smlnum) { - Purpose - ======= +/* Scale matrix norm up to SMLNUM. */ - SGEQRF computes a QR factorization of a real M-by-N matrix A: - A = Q * R. + slascl_("G", &c__0, &c__0, &anrm, &smlnum, m, n, &a[a_offset], lda, + info); + iascl = 1; + } else if (anrm > bignum) { - Arguments - ========= +/* Scale matrix norm down to BIGNUM. */ - M (input) INTEGER - The number of rows of the matrix A. M >= 0. + slascl_("G", &c__0, &c__0, &anrm, &bignum, m, n, &a[a_offset], lda, + info); + iascl = 2; + } else if (anrm == 0.f) { + +/* Matrix all zero. Return zero solution. */ + + i__1 = max(*m,*n); + slaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[b_offset], ldb); + slaset_("F", &minmn, &c__1, &c_b29, &c_b29, &s[1], &c__1); + *rank = 0; + goto L10; + } + +/* Scale B if max entry outside range [SMLNUM,BIGNUM]. */ + + bnrm = slange_("M", m, nrhs, &b[b_offset], ldb, &work[1]); + ibscl = 0; + if (bnrm > 0.f && bnrm < smlnum) { + +/* Scale matrix norm up to SMLNUM. */ + + slascl_("G", &c__0, &c__0, &bnrm, &smlnum, m, nrhs, &b[b_offset], ldb, + info); + ibscl = 1; + } else if (bnrm > bignum) { + +/* Scale matrix norm down to BIGNUM. */ + + slascl_("G", &c__0, &c__0, &bnrm, &bignum, m, nrhs, &b[b_offset], ldb, + info); + ibscl = 2; + } + +/* If M < N make sure certain entries of B are zero. */ + + if (*m < *n) { + i__1 = *n - *m; + slaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[*m + 1 + b_dim1], ldb); + } + +/* Overdetermined case. */ + + if (*m >= *n) { + +/* Path 1 - overdetermined or exactly determined. */ + + mm = *m; + if (*m >= mnthr) { + +/* Path 1a - overdetermined, with many more rows than columns. */ + + mm = *n; + itau = 1; + nwork = itau + *n; + +/* + Compute A=Q*R. + (Workspace: need 2*N, prefer N+N*NB) +*/ + + i__1 = *lwork - nwork + 1; + sgeqrf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1, + info); + +/* + Multiply B by transpose(Q). + (Workspace: need N+NRHS, prefer N+NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + sormqr_("L", "T", m, nrhs, n, &a[a_offset], lda, &work[itau], &b[ + b_offset], ldb, &work[nwork], &i__1, info); + +/* Zero out below R. */ + + if (*n > 1) { + i__1 = *n - 1; + i__2 = *n - 1; + slaset_("L", &i__1, &i__2, &c_b29, &c_b29, &a[a_dim1 + 2], + lda); + } + } + + ie = 1; + itauq = ie + *n; + itaup = itauq + *n; + nwork = itaup + *n; + +/* + Bidiagonalize R in A. + (Workspace: need 3*N+MM, prefer 3*N+(MM+N)*NB) +*/ + + i__1 = *lwork - nwork + 1; + sgebrd_(&mm, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], & + work[itaup], &work[nwork], &i__1, info); + +/* + Multiply B by transpose of left bidiagonalizing vectors of R. + (Workspace: need 3*N+NRHS, prefer 3*N+NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + sormbr_("Q", "L", "T", &mm, nrhs, n, &a[a_offset], lda, &work[itauq], + &b[b_offset], ldb, &work[nwork], &i__1, info); + +/* Solve the bidiagonal least squares problem. */ + + slalsd_("U", &smlsiz, n, nrhs, &s[1], &work[ie], &b[b_offset], ldb, + rcond, rank, &work[nwork], &iwork[1], info); + if (*info != 0) { + goto L10; + } + +/* Multiply B by right bidiagonalizing vectors of R. */ + + i__1 = *lwork - nwork + 1; + sormbr_("P", "L", "N", n, nrhs, n, &a[a_offset], lda, &work[itaup], & + b[b_offset], ldb, &work[nwork], &i__1, info); + + } else /* if(complicated condition) */ { +/* Computing MAX */ + i__1 = *m, i__2 = (*m << 1) - 4, i__1 = max(i__1,i__2), i__1 = max( + i__1,*nrhs), i__2 = *n - *m * 3, i__1 = max(i__1,i__2); + if (*n >= mnthr && *lwork >= (*m << 2) + *m * *m + max(i__1,wlalsd)) { + +/* + Path 2a - underdetermined, with many more columns than rows + and sufficient workspace for an efficient algorithm. +*/ + + ldwork = *m; +/* + Computing MAX + Computing MAX +*/ + i__3 = *m, i__4 = (*m << 1) - 4, i__3 = max(i__3,i__4), i__3 = + max(i__3,*nrhs), i__4 = *n - *m * 3; + i__1 = (*m << 2) + *m * *lda + max(i__3,i__4), i__2 = *m * *lda + + *m + *m * *nrhs, i__1 = max(i__1,i__2), i__2 = (*m << 2) + + *m * *lda + wlalsd; + if (*lwork >= max(i__1,i__2)) { + ldwork = *lda; + } + itau = 1; + nwork = *m + 1; + +/* + Compute A=L*Q. + (Workspace: need 2*M, prefer M+M*NB) +*/ + + i__1 = *lwork - nwork + 1; + sgelqf_(m, n, &a[a_offset], lda, &work[itau], &work[nwork], &i__1, + info); + il = nwork; + +/* Copy L to WORK(IL), zeroing out above its diagonal. */ + + slacpy_("L", m, m, &a[a_offset], lda, &work[il], &ldwork); + i__1 = *m - 1; + i__2 = *m - 1; + slaset_("U", &i__1, &i__2, &c_b29, &c_b29, &work[il + ldwork], & + ldwork); + ie = il + ldwork * *m; + itauq = ie + *m; + itaup = itauq + *m; + nwork = itaup + *m; + +/* + Bidiagonalize L in WORK(IL). + (Workspace: need M*M+5*M, prefer M*M+4*M+2*M*NB) +*/ + + i__1 = *lwork - nwork + 1; + sgebrd_(m, m, &work[il], &ldwork, &s[1], &work[ie], &work[itauq], + &work[itaup], &work[nwork], &i__1, info); + +/* + Multiply B by transpose of left bidiagonalizing vectors of L. + (Workspace: need M*M+4*M+NRHS, prefer M*M+4*M+NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + sormbr_("Q", "L", "T", m, nrhs, m, &work[il], &ldwork, &work[ + itauq], &b[b_offset], ldb, &work[nwork], &i__1, info); + +/* Solve the bidiagonal least squares problem. */ + + slalsd_("U", &smlsiz, m, nrhs, &s[1], &work[ie], &b[b_offset], + ldb, rcond, rank, &work[nwork], &iwork[1], info); + if (*info != 0) { + goto L10; + } + +/* Multiply B by right bidiagonalizing vectors of L. */ + + i__1 = *lwork - nwork + 1; + sormbr_("P", "L", "N", m, nrhs, m, &work[il], &ldwork, &work[ + itaup], &b[b_offset], ldb, &work[nwork], &i__1, info); + +/* Zero out below first M rows of B. */ + + i__1 = *n - *m; + slaset_("F", &i__1, nrhs, &c_b29, &c_b29, &b[*m + 1 + b_dim1], + ldb); + nwork = itau + *m; + +/* + Multiply transpose(Q) by B. + (Workspace: need M+NRHS, prefer M+NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + sormlq_("L", "T", n, nrhs, m, &a[a_offset], lda, &work[itau], &b[ + b_offset], ldb, &work[nwork], &i__1, info); + + } else { + +/* Path 2 - remaining underdetermined cases. */ + + ie = 1; + itauq = ie + *m; + itaup = itauq + *m; + nwork = itaup + *m; + +/* + Bidiagonalize A. + (Workspace: need 3*M+N, prefer 3*M+(M+N)*NB) +*/ + + i__1 = *lwork - nwork + 1; + sgebrd_(m, n, &a[a_offset], lda, &s[1], &work[ie], &work[itauq], & + work[itaup], &work[nwork], &i__1, info); + +/* + Multiply B by transpose of left bidiagonalizing vectors. + (Workspace: need 3*M+NRHS, prefer 3*M+NRHS*NB) +*/ + + i__1 = *lwork - nwork + 1; + sormbr_("Q", "L", "T", m, nrhs, n, &a[a_offset], lda, &work[itauq] + , &b[b_offset], ldb, &work[nwork], &i__1, info); + +/* Solve the bidiagonal least squares problem. */ + + slalsd_("L", &smlsiz, m, nrhs, &s[1], &work[ie], &b[b_offset], + ldb, rcond, rank, &work[nwork], &iwork[1], info); + if (*info != 0) { + goto L10; + } + +/* Multiply B by right bidiagonalizing vectors of A. */ + + i__1 = *lwork - nwork + 1; + sormbr_("P", "L", "N", n, nrhs, m, &a[a_offset], lda, &work[itaup] + , &b[b_offset], ldb, &work[nwork], &i__1, info); + + } + } + +/* Undo scaling. */ + + if (iascl == 1) { + slascl_("G", &c__0, &c__0, &anrm, &smlnum, n, nrhs, &b[b_offset], ldb, + info); + slascl_("G", &c__0, &c__0, &smlnum, &anrm, &minmn, &c__1, &s[1], & + minmn, info); + } else if (iascl == 2) { + slascl_("G", &c__0, &c__0, &anrm, &bignum, n, nrhs, &b[b_offset], ldb, + info); + slascl_("G", &c__0, &c__0, &bignum, &anrm, &minmn, &c__1, &s[1], & + minmn, info); + } + if (ibscl == 1) { + slascl_("G", &c__0, &c__0, &smlnum, &bnrm, n, nrhs, &b[b_offset], ldb, + info); + } else if (ibscl == 2) { + slascl_("G", &c__0, &c__0, &bignum, &bnrm, n, nrhs, &b[b_offset], ldb, + info); + } + +L10: + work[1] = (real) maxwrk; + iwork[1] = liwork; + return 0; + +/* End of SGELSD */ + +} /* sgelsd_ */ + +/* Subroutine */ int sgeqr2_(integer *m, integer *n, real *a, integer *lda, + real *tau, real *work, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3; + + /* Local variables */ + static integer i__, k; + static real aii; + extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, + integer *, real *, real *, integer *, real *), xerbla_( + char *, integer *), slarfg_(integer *, real *, real *, + integer *, real *); + + +/* + -- LAPACK routine (version 3.2.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + June 2010 + + + Purpose + ======= + + SGEQR2 computes a QR factorization of a real m by n matrix A: + A = Q * R. + + Arguments + ========= + + M (input) INTEGER + The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input/output) REAL array, dimension (LDA,N) - On entry, the M-by-N matrix A. + On entry, the m by n matrix A. On exit, the elements on and above the diagonal of the array - contain the min(M,N)-by-N upper trapezoidal matrix R (R is + contain the min(m,n) by n upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal matrix Q as a - product of min(m,n) elementary reflectors (see Further - Details). + product of elementary reflectors (see Further Details). LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). @@ -4216,22 +4782,11 @@ static real c_b2863 = 2.f; The scalar factors of the elementary reflectors (see Further Details). - WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) - On exit, if INFO = 0, WORK(1) returns the optimal LWORK. - - LWORK (input) INTEGER - The dimension of the array WORK. LWORK >= max(1,N). - For optimum performance LWORK >= N*NB, where NB is - the optimal blocksize. - - If LWORK = -1, then a workspace query is assumed; the routine - only calculates the optimal size of the WORK array, returns - this value as the first entry of the WORK array, and no error - message related to LWORK is issued by XERBLA. + WORK (workspace) REAL array, dimension (N) INFO (output) INTEGER - = 0: successful exit - < 0: if INFO = -i, the i-th argument had an illegal value + = 0: successful exit + < 0: if INFO = -i, the i-th argument had an illegal value Further Details =============== @@ -4263,68 +4818,218 @@ static real c_b2863 = 2.f; /* Function Body */ *info = 0; - nb = ilaenv_(&c__1, "SGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen) - 1); - lwkopt = *n * nb; - work[1] = (real) lwkopt; - lquery = *lwork == -1; if (*m < 0) { *info = -1; } else if (*n < 0) { *info = -2; } else if (*lda < max(1,*m)) { *info = -4; - } else if (*lwork < max(1,*n) && ! lquery) { - *info = -7; } if (*info != 0) { i__1 = -(*info); - xerbla_("SGEQRF", &i__1); - return 0; - } else if (lquery) { + xerbla_("SGEQR2", &i__1); return 0; } -/* Quick return if possible */ - k = min(*m,*n); - if (k == 0) { - work[1] = 1.f; - return 0; - } - - nbmin = 2; - nx = 0; - iws = *n; - if (nb > 1 && nb < k) { - -/* - Determine when to cross over from blocked to unblocked code. - Computing MAX -*/ - i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQRF", " ", m, n, &c_n1, &c_n1, ( - ftnlen)6, (ftnlen)1); - nx = max(i__1,i__2); - if (nx < k) { + i__1 = k; + for (i__ = 1; i__ <= i__1; ++i__) { -/* Determine if workspace is large enough for blocked code. */ +/* Generate elementary reflector H(i) to annihilate A(i+1:m,i) */ - ldwork = *n; - iws = ldwork * nb; - if (*lwork < iws) { + i__2 = *m - i__ + 1; +/* Computing MIN */ + i__3 = i__ + 1; + slarfg_(&i__2, &a[i__ + i__ * a_dim1], &a[min(i__3,*m) + i__ * a_dim1] + , &c__1, &tau[i__]); + if (i__ < *n) { -/* - Not enough workspace to use optimal NB: reduce NB and - determine the minimum value of NB. -*/ +/* Apply H(i) to A(i:m,i+1:n) from the left */ - nb = *lwork / ldwork; -/* Computing MAX */ - i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQRF", " ", m, n, &c_n1, & - c_n1, (ftnlen)6, (ftnlen)1); - nbmin = max(i__1,i__2); - } + aii = a[i__ + i__ * a_dim1]; + a[i__ + i__ * a_dim1] = 1.f; + i__2 = *m - i__ + 1; + i__3 = *n - i__; + slarf_("Left", &i__2, &i__3, &a[i__ + i__ * a_dim1], &c__1, &tau[ + i__], &a[i__ + (i__ + 1) * a_dim1], lda, &work[1]); + a[i__ + i__ * a_dim1] = aii; + } +/* L10: */ + } + return 0; + +/* End of SGEQR2 */ + +} /* sgeqr2_ */ + +/* Subroutine */ int sgeqrf_(integer *m, integer *n, real *a, integer *lda, + real *tau, real *work, integer *lwork, integer *info) +{ + /* System generated locals */ + integer a_dim1, a_offset, i__1, i__2, i__3, i__4; + + /* Local variables */ + static integer i__, k, ib, nb, nx, iws, nbmin, iinfo; + extern /* Subroutine */ int sgeqr2_(integer *, integer *, real *, integer + *, real *, real *, integer *), slarfb_(char *, char *, char *, + char *, integer *, integer *, integer *, real *, integer *, real * + , integer *, real *, integer *, real *, integer *), xerbla_(char *, integer *); + extern integer ilaenv_(integer *, char *, char *, integer *, integer *, + integer *, integer *, ftnlen, ftnlen); + extern /* Subroutine */ int slarft_(char *, char *, integer *, integer *, + real *, integer *, real *, real *, integer *); + static integer ldwork, lwkopt; + static logical lquery; + + +/* + -- LAPACK routine (version 3.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + November 2006 + + + Purpose + ======= + + SGEQRF computes a QR factorization of a real M-by-N matrix A: + A = Q * R. + + Arguments + ========= + + M (input) INTEGER + The number of rows of the matrix A. M >= 0. + + N (input) INTEGER + The number of columns of the matrix A. N >= 0. + + A (input/output) REAL array, dimension (LDA,N) + On entry, the M-by-N matrix A. + On exit, the elements on and above the diagonal of the array + contain the min(M,N)-by-N upper trapezoidal matrix R (R is + upper triangular if m >= n); the elements below the diagonal, + with the array TAU, represent the orthogonal matrix Q as a + product of min(m,n) elementary reflectors (see Further + Details). + + LDA (input) INTEGER + The leading dimension of the array A. LDA >= max(1,M). + + TAU (output) REAL array, dimension (min(M,N)) + The scalar factors of the elementary reflectors (see Further + Details). + + WORK (workspace/output) REAL array, dimension (MAX(1,LWORK)) + On exit, if INFO = 0, WORK(1) returns the optimal LWORK. + + LWORK (input) INTEGER + The dimension of the array WORK. LWORK >= max(1,N). + For optimum performance LWORK >= N*NB, where NB is + the optimal blocksize. + + If LWORK = -1, then a workspace query is assumed; the routine + only calculates the optimal size of the WORK array, returns + this value as the first entry of the WORK array, and no error + message related to LWORK is issued by XERBLA. + + INFO (output) INTEGER + = 0: successful exit + < 0: if INFO = -i, the i-th argument had an illegal value + + Further Details + =============== + + The matrix Q is represented as a product of elementary reflectors + + Q = H(1) H(2) . . . H(k), where k = min(m,n). + + Each H(i) has the form + + H(i) = I - tau * v * v' + + where tau is a real scalar, and v is a real vector with + v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i), + and tau in TAU(i). + + ===================================================================== + + + Test the input arguments +*/ + + /* Parameter adjustments */ + a_dim1 = *lda; + a_offset = 1 + a_dim1; + a -= a_offset; + --tau; + --work; + + /* Function Body */ + *info = 0; + nb = ilaenv_(&c__1, "SGEQRF", " ", m, n, &c_n1, &c_n1, (ftnlen)6, (ftnlen) + 1); + lwkopt = *n * nb; + work[1] = (real) lwkopt; + lquery = *lwork == -1; + if (*m < 0) { + *info = -1; + } else if (*n < 0) { + *info = -2; + } else if (*lda < max(1,*m)) { + *info = -4; + } else if (*lwork < max(1,*n) && ! lquery) { + *info = -7; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SGEQRF", &i__1); + return 0; + } else if (lquery) { + return 0; + } + +/* Quick return if possible */ + + k = min(*m,*n); + if (k == 0) { + work[1] = 1.f; + return 0; + } + + nbmin = 2; + nx = 0; + iws = *n; + if (nb > 1 && nb < k) { + +/* + Determine when to cross over from blocked to unblocked code. + + Computing MAX +*/ + i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQRF", " ", m, n, &c_n1, &c_n1, ( + ftnlen)6, (ftnlen)1); + nx = max(i__1,i__2); + if (nx < k) { + +/* Determine if workspace is large enough for blocked code. */ + + ldwork = *n; + iws = ldwork * nb; + if (*lwork < iws) { + +/* + Not enough workspace to use optimal NB: reduce NB and + determine the minimum value of NB. +*/ + + nb = *lwork / ldwork; +/* Computing MAX */ + i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQRF", " ", m, n, &c_n1, & + c_n1, (ftnlen)6, (ftnlen)1); + nbmin = max(i__1,i__2); + } } } @@ -14248,6 +14953,1389 @@ logical slaisnan_(real *sin1, real *sin2) #undef ci +/* Subroutine */ int slals0_(integer *icompq, integer *nl, integer *nr, + integer *sqre, integer *nrhs, real *b, integer *ldb, real *bx, + integer *ldbx, integer *perm, integer *givptr, integer *givcol, + integer *ldgcol, real *givnum, integer *ldgnum, real *poles, real * + difl, real *difr, real *z__, integer *k, real *c__, real *s, real * + work, integer *info) +{ + /* System generated locals */ + integer givcol_dim1, givcol_offset, b_dim1, b_offset, bx_dim1, bx_offset, + difr_dim1, difr_offset, givnum_dim1, givnum_offset, poles_dim1, + poles_offset, i__1, i__2; + real r__1; + + /* Local variables */ + static integer i__, j, m, n; + static real dj; + static integer nlp1; + static real temp; + extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + integer *, real *, real *); + extern doublereal snrm2_(integer *, real *, integer *); + static real diflj, difrj, dsigj; + extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *), + sgemv_(char *, integer *, integer *, real *, real *, integer *, + real *, integer *, real *, real *, integer *), scopy_( + integer *, real *, integer *, real *, integer *); + extern doublereal slamc3_(real *, real *); + extern /* Subroutine */ int xerbla_(char *, integer *); + static real dsigjp; + extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, + real *, integer *, integer *, real *, integer *, integer *), slacpy_(char *, integer *, integer *, real *, integer *, + real *, integer *); + + +/* + -- LAPACK routine (version 3.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + November 2006 + + + Purpose + ======= + + SLALS0 applies back the multiplying factors of either the left or the + right singular vector matrix of a diagonal matrix appended by a row + to the right hand side matrix B in solving the least squares problem + using the divide-and-conquer SVD approach. + + For the left singular vector matrix, three types of orthogonal + matrices are involved: + + (1L) Givens rotations: the number of such rotations is GIVPTR; the + pairs of columns/rows they were applied to are stored in GIVCOL; + and the C- and S-values of these rotations are stored in GIVNUM. + + (2L) Permutation. The (NL+1)-st row of B is to be moved to the first + row, and for J=2:N, PERM(J)-th row of B is to be moved to the + J-th row. + + (3L) The left singular vector matrix of the remaining matrix. + + For the right singular vector matrix, four types of orthogonal + matrices are involved: + + (1R) The right singular vector matrix of the remaining matrix. + + (2R) If SQRE = 1, one extra Givens rotation to generate the right + null space. + + (3R) The inverse transformation of (2L). + + (4R) The inverse transformation of (1L). + + Arguments + ========= + + ICOMPQ (input) INTEGER + Specifies whether singular vectors are to be computed in + factored form: + = 0: Left singular vector matrix. + = 1: Right singular vector matrix. + + NL (input) INTEGER + The row dimension of the upper block. NL >= 1. + + NR (input) INTEGER + The row dimension of the lower block. NR >= 1. + + SQRE (input) INTEGER + = 0: the lower block is an NR-by-NR square matrix. + = 1: the lower block is an NR-by-(NR+1) rectangular matrix. + + The bidiagonal matrix has row dimension N = NL + NR + 1, + and column dimension M = N + SQRE. + + NRHS (input) INTEGER + The number of columns of B and BX. NRHS must be at least 1. + + B (input/output) REAL array, dimension ( LDB, NRHS ) + On input, B contains the right hand sides of the least + squares problem in rows 1 through M. On output, B contains + the solution X in rows 1 through N. + + LDB (input) INTEGER + The leading dimension of B. LDB must be at least + max(1,MAX( M, N ) ). + + BX (workspace) REAL array, dimension ( LDBX, NRHS ) + + LDBX (input) INTEGER + The leading dimension of BX. + + PERM (input) INTEGER array, dimension ( N ) + The permutations (from deflation and sorting) applied + to the two blocks. + + GIVPTR (input) INTEGER + The number of Givens rotations which took place in this + subproblem. + + GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 ) + Each pair of numbers indicates a pair of rows/columns + involved in a Givens rotation. + + LDGCOL (input) INTEGER + The leading dimension of GIVCOL, must be at least N. + + GIVNUM (input) REAL array, dimension ( LDGNUM, 2 ) + Each number indicates the C or S value used in the + corresponding Givens rotation. + + LDGNUM (input) INTEGER + The leading dimension of arrays DIFR, POLES and + GIVNUM, must be at least K. + + POLES (input) REAL array, dimension ( LDGNUM, 2 ) + On entry, POLES(1:K, 1) contains the new singular + values obtained from solving the secular equation, and + POLES(1:K, 2) is an array containing the poles in the secular + equation. + + DIFL (input) REAL array, dimension ( K ). + On entry, DIFL(I) is the distance between I-th updated + (undeflated) singular value and the I-th (undeflated) old + singular value. + + DIFR (input) REAL array, dimension ( LDGNUM, 2 ). + On entry, DIFR(I, 1) contains the distances between I-th + updated (undeflated) singular value and the I+1-th + (undeflated) old singular value. And DIFR(I, 2) is the + normalizing factor for the I-th right singular vector. + + Z (input) REAL array, dimension ( K ) + Contain the components of the deflation-adjusted updating row + vector. + + K (input) INTEGER + Contains the dimension of the non-deflated matrix, + This is the order of the related secular equation. 1 <= K <=N. + + C (input) REAL + C contains garbage if SQRE =0 and the C-value of a Givens + rotation related to the right null space if SQRE = 1. + + S (input) REAL + S contains garbage if SQRE =0 and the S-value of a Givens + rotation related to the right null space if SQRE = 1. + + WORK (workspace) REAL array, dimension ( K ) + + INFO (output) INTEGER + = 0: successful exit. + < 0: if INFO = -i, the i-th argument had an illegal value. + + Further Details + =============== + + Based on contributions by + Ming Gu and Ren-Cang Li, Computer Science Division, University of + California at Berkeley, USA + Osni Marques, LBNL/NERSC, USA + + ===================================================================== + + + Test the input parameters. +*/ + + /* Parameter adjustments */ + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + bx_dim1 = *ldbx; + bx_offset = 1 + bx_dim1; + bx -= bx_offset; + --perm; + givcol_dim1 = *ldgcol; + givcol_offset = 1 + givcol_dim1; + givcol -= givcol_offset; + difr_dim1 = *ldgnum; + difr_offset = 1 + difr_dim1; + difr -= difr_offset; + poles_dim1 = *ldgnum; + poles_offset = 1 + poles_dim1; + poles -= poles_offset; + givnum_dim1 = *ldgnum; + givnum_offset = 1 + givnum_dim1; + givnum -= givnum_offset; + --difl; + --z__; + --work; + + /* Function Body */ + *info = 0; + + if (*icompq < 0 || *icompq > 1) { + *info = -1; + } else if (*nl < 1) { + *info = -2; + } else if (*nr < 1) { + *info = -3; + } else if (*sqre < 0 || *sqre > 1) { + *info = -4; + } + + n = *nl + *nr + 1; + + if (*nrhs < 1) { + *info = -5; + } else if (*ldb < n) { + *info = -7; + } else if (*ldbx < n) { + *info = -9; + } else if (*givptr < 0) { + *info = -11; + } else if (*ldgcol < n) { + *info = -13; + } else if (*ldgnum < n) { + *info = -15; + } else if (*k < 1) { + *info = -20; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SLALS0", &i__1); + return 0; + } + + m = n + *sqre; + nlp1 = *nl + 1; + + if (*icompq == 0) { + +/* + Apply back orthogonal transformations from the left. + + Step (1L): apply back the Givens rotations performed. +*/ + + i__1 = *givptr; + for (i__ = 1; i__ <= i__1; ++i__) { + srot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, & + b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ + + (givnum_dim1 << 1)], &givnum[i__ + givnum_dim1]); +/* L10: */ + } + +/* Step (2L): permute rows of B. */ + + scopy_(nrhs, &b[nlp1 + b_dim1], ldb, &bx[bx_dim1 + 1], ldbx); + i__1 = n; + for (i__ = 2; i__ <= i__1; ++i__) { + scopy_(nrhs, &b[perm[i__] + b_dim1], ldb, &bx[i__ + bx_dim1], + ldbx); +/* L20: */ + } + +/* + Step (3L): apply the inverse of the left singular vector + matrix to BX. +*/ + + if (*k == 1) { + scopy_(nrhs, &bx[bx_offset], ldbx, &b[b_offset], ldb); + if (z__[1] < 0.f) { + sscal_(nrhs, &c_b151, &b[b_offset], ldb); + } + } else { + i__1 = *k; + for (j = 1; j <= i__1; ++j) { + diflj = difl[j]; + dj = poles[j + poles_dim1]; + dsigj = -poles[j + (poles_dim1 << 1)]; + if (j < *k) { + difrj = -difr[j + difr_dim1]; + dsigjp = -poles[j + 1 + (poles_dim1 << 1)]; + } + if (z__[j] == 0.f || poles[j + (poles_dim1 << 1)] == 0.f) { + work[j] = 0.f; + } else { + work[j] = -poles[j + (poles_dim1 << 1)] * z__[j] / diflj / + (poles[j + (poles_dim1 << 1)] + dj); + } + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] == + 0.f) { + work[i__] = 0.f; + } else { + work[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__] + / (slamc3_(&poles[i__ + (poles_dim1 << 1)], & + dsigj) - diflj) / (poles[i__ + (poles_dim1 << + 1)] + dj); + } +/* L30: */ + } + i__2 = *k; + for (i__ = j + 1; i__ <= i__2; ++i__) { + if (z__[i__] == 0.f || poles[i__ + (poles_dim1 << 1)] == + 0.f) { + work[i__] = 0.f; + } else { + work[i__] = poles[i__ + (poles_dim1 << 1)] * z__[i__] + / (slamc3_(&poles[i__ + (poles_dim1 << 1)], & + dsigjp) + difrj) / (poles[i__ + (poles_dim1 << + 1)] + dj); + } +/* L40: */ + } + work[1] = -1.f; + temp = snrm2_(k, &work[1], &c__1); + sgemv_("T", k, nrhs, &c_b15, &bx[bx_offset], ldbx, &work[1], & + c__1, &c_b29, &b[j + b_dim1], ldb); + slascl_("G", &c__0, &c__0, &temp, &c_b15, &c__1, nrhs, &b[j + + b_dim1], ldb, info); +/* L50: */ + } + } + +/* Move the deflated rows of BX to B also. */ + + if (*k < max(m,n)) { + i__1 = n - *k; + slacpy_("A", &i__1, nrhs, &bx[*k + 1 + bx_dim1], ldbx, &b[*k + 1 + + b_dim1], ldb); + } + } else { + +/* + Apply back the right orthogonal transformations. + + Step (1R): apply back the new right singular vector matrix + to B. +*/ + + if (*k == 1) { + scopy_(nrhs, &b[b_offset], ldb, &bx[bx_offset], ldbx); + } else { + i__1 = *k; + for (j = 1; j <= i__1; ++j) { + dsigj = poles[j + (poles_dim1 << 1)]; + if (z__[j] == 0.f) { + work[j] = 0.f; + } else { + work[j] = -z__[j] / difl[j] / (dsigj + poles[j + + poles_dim1]) / difr[j + (difr_dim1 << 1)]; + } + i__2 = j - 1; + for (i__ = 1; i__ <= i__2; ++i__) { + if (z__[j] == 0.f) { + work[i__] = 0.f; + } else { + r__1 = -poles[i__ + 1 + (poles_dim1 << 1)]; + work[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difr[ + i__ + difr_dim1]) / (dsigj + poles[i__ + + poles_dim1]) / difr[i__ + (difr_dim1 << 1)]; + } +/* L60: */ + } + i__2 = *k; + for (i__ = j + 1; i__ <= i__2; ++i__) { + if (z__[j] == 0.f) { + work[i__] = 0.f; + } else { + r__1 = -poles[i__ + (poles_dim1 << 1)]; + work[i__] = z__[j] / (slamc3_(&dsigj, &r__1) - difl[ + i__]) / (dsigj + poles[i__ + poles_dim1]) / + difr[i__ + (difr_dim1 << 1)]; + } +/* L70: */ + } + sgemv_("T", k, nrhs, &c_b15, &b[b_offset], ldb, &work[1], & + c__1, &c_b29, &bx[j + bx_dim1], ldbx); +/* L80: */ + } + } + +/* + Step (2R): if SQRE = 1, apply back the rotation that is + related to the right null space of the subproblem. +*/ + + if (*sqre == 1) { + scopy_(nrhs, &b[m + b_dim1], ldb, &bx[m + bx_dim1], ldbx); + srot_(nrhs, &bx[bx_dim1 + 1], ldbx, &bx[m + bx_dim1], ldbx, c__, + s); + } + if (*k < max(m,n)) { + i__1 = n - *k; + slacpy_("A", &i__1, nrhs, &b[*k + 1 + b_dim1], ldb, &bx[*k + 1 + + bx_dim1], ldbx); + } + +/* Step (3R): permute rows of B. */ + + scopy_(nrhs, &bx[bx_dim1 + 1], ldbx, &b[nlp1 + b_dim1], ldb); + if (*sqre == 1) { + scopy_(nrhs, &bx[m + bx_dim1], ldbx, &b[m + b_dim1], ldb); + } + i__1 = n; + for (i__ = 2; i__ <= i__1; ++i__) { + scopy_(nrhs, &bx[i__ + bx_dim1], ldbx, &b[perm[i__] + b_dim1], + ldb); +/* L90: */ + } + +/* Step (4R): apply back the Givens rotations performed. */ + + for (i__ = *givptr; i__ >= 1; --i__) { + r__1 = -givnum[i__ + givnum_dim1]; + srot_(nrhs, &b[givcol[i__ + (givcol_dim1 << 1)] + b_dim1], ldb, & + b[givcol[i__ + givcol_dim1] + b_dim1], ldb, &givnum[i__ + + (givnum_dim1 << 1)], &r__1); +/* L100: */ + } + } + + return 0; + +/* End of SLALS0 */ + +} /* slals0_ */ + +/* Subroutine */ int slalsa_(integer *icompq, integer *smlsiz, integer *n, + integer *nrhs, real *b, integer *ldb, real *bx, integer *ldbx, real * + u, integer *ldu, real *vt, integer *k, real *difl, real *difr, real * + z__, real *poles, integer *givptr, integer *givcol, integer *ldgcol, + integer *perm, real *givnum, real *c__, real *s, real *work, integer * + iwork, integer *info) +{ + /* System generated locals */ + integer givcol_dim1, givcol_offset, perm_dim1, perm_offset, b_dim1, + b_offset, bx_dim1, bx_offset, difl_dim1, difl_offset, difr_dim1, + difr_offset, givnum_dim1, givnum_offset, poles_dim1, poles_offset, + u_dim1, u_offset, vt_dim1, vt_offset, z_dim1, z_offset, i__1, + i__2; + + /* Local variables */ + static integer i__, j, i1, ic, lf, nd, ll, nl, nr, im1, nlf, nrf, lvl, + ndb1, nlp1, lvl2, nrp1, nlvl, sqre, inode, ndiml; + extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, + integer *, real *, real *, integer *, real *, integer *, real *, + real *, integer *); + static integer ndimr; + extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + integer *), slals0_(integer *, integer *, integer *, integer *, + integer *, real *, integer *, real *, integer *, integer *, + integer *, integer *, integer *, real *, integer *, real *, real * + , real *, real *, integer *, real *, real *, real *, integer *), + xerbla_(char *, integer *), slasdt_(integer *, integer *, + integer *, integer *, integer *, integer *, integer *); + + +/* + -- LAPACK routine (version 3.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + November 2006 + + + Purpose + ======= + + SLALSA is an itermediate step in solving the least squares problem + by computing the SVD of the coefficient matrix in compact form (The + singular vectors are computed as products of simple orthorgonal + matrices.). + + If ICOMPQ = 0, SLALSA applies the inverse of the left singular vector + matrix of an upper bidiagonal matrix to the right hand side; and if + ICOMPQ = 1, SLALSA applies the right singular vector matrix to the + right hand side. The singular vector matrices were generated in + compact form by SLALSA. + + Arguments + ========= + + + ICOMPQ (input) INTEGER + Specifies whether the left or the right singular vector + matrix is involved. + = 0: Left singular vector matrix + = 1: Right singular vector matrix + + SMLSIZ (input) INTEGER + The maximum size of the subproblems at the bottom of the + computation tree. + + N (input) INTEGER + The row and column dimensions of the upper bidiagonal matrix. + + NRHS (input) INTEGER + The number of columns of B and BX. NRHS must be at least 1. + + B (input/output) REAL array, dimension ( LDB, NRHS ) + On input, B contains the right hand sides of the least + squares problem in rows 1 through M. + On output, B contains the solution X in rows 1 through N. + + LDB (input) INTEGER + The leading dimension of B in the calling subprogram. + LDB must be at least max(1,MAX( M, N ) ). + + BX (output) REAL array, dimension ( LDBX, NRHS ) + On exit, the result of applying the left or right singular + vector matrix to B. + + LDBX (input) INTEGER + The leading dimension of BX. + + U (input) REAL array, dimension ( LDU, SMLSIZ ). + On entry, U contains the left singular vector matrices of all + subproblems at the bottom level. + + LDU (input) INTEGER, LDU = > N. + The leading dimension of arrays U, VT, DIFL, DIFR, + POLES, GIVNUM, and Z. + + VT (input) REAL array, dimension ( LDU, SMLSIZ+1 ). + On entry, VT' contains the right singular vector matrices of + all subproblems at the bottom level. + + K (input) INTEGER array, dimension ( N ). + + DIFL (input) REAL array, dimension ( LDU, NLVL ). + where NLVL = INT(log_2 (N/(SMLSIZ+1))) + 1. + + DIFR (input) REAL array, dimension ( LDU, 2 * NLVL ). + On entry, DIFL(*, I) and DIFR(*, 2 * I -1) record + distances between singular values on the I-th level and + singular values on the (I -1)-th level, and DIFR(*, 2 * I) + record the normalizing factors of the right singular vectors + matrices of subproblems on I-th level. + + Z (input) REAL array, dimension ( LDU, NLVL ). + On entry, Z(1, I) contains the components of the deflation- + adjusted updating row vector for subproblems on the I-th + level. + + POLES (input) REAL array, dimension ( LDU, 2 * NLVL ). + On entry, POLES(*, 2 * I -1: 2 * I) contains the new and old + singular values involved in the secular equations on the I-th + level. + + GIVPTR (input) INTEGER array, dimension ( N ). + On entry, GIVPTR( I ) records the number of Givens + rotations performed on the I-th problem on the computation + tree. + + GIVCOL (input) INTEGER array, dimension ( LDGCOL, 2 * NLVL ). + On entry, for each I, GIVCOL(*, 2 * I - 1: 2 * I) records the + locations of Givens rotations performed on the I-th level on + the computation tree. + + LDGCOL (input) INTEGER, LDGCOL = > N. + The leading dimension of arrays GIVCOL and PERM. + + PERM (input) INTEGER array, dimension ( LDGCOL, NLVL ). + On entry, PERM(*, I) records permutations done on the I-th + level of the computation tree. + + GIVNUM (input) REAL array, dimension ( LDU, 2 * NLVL ). + On entry, GIVNUM(*, 2 *I -1 : 2 * I) records the C- and S- + values of Givens rotations performed on the I-th level on the + computation tree. + + C (input) REAL array, dimension ( N ). + On entry, if the I-th subproblem is not square, + C( I ) contains the C-value of a Givens rotation related to + the right null space of the I-th subproblem. + + S (input) REAL array, dimension ( N ). + On entry, if the I-th subproblem is not square, + S( I ) contains the S-value of a Givens rotation related to + the right null space of the I-th subproblem. + + WORK (workspace) REAL array. + The dimension must be at least N. + + IWORK (workspace) INTEGER array. + The dimension must be at least 3 * N + + INFO (output) INTEGER + = 0: successful exit. + < 0: if INFO = -i, the i-th argument had an illegal value. + + Further Details + =============== + + Based on contributions by + Ming Gu and Ren-Cang Li, Computer Science Division, University of + California at Berkeley, USA + Osni Marques, LBNL/NERSC, USA + + ===================================================================== + + + Test the input parameters. +*/ + + /* Parameter adjustments */ + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + bx_dim1 = *ldbx; + bx_offset = 1 + bx_dim1; + bx -= bx_offset; + givnum_dim1 = *ldu; + givnum_offset = 1 + givnum_dim1; + givnum -= givnum_offset; + poles_dim1 = *ldu; + poles_offset = 1 + poles_dim1; + poles -= poles_offset; + z_dim1 = *ldu; + z_offset = 1 + z_dim1; + z__ -= z_offset; + difr_dim1 = *ldu; + difr_offset = 1 + difr_dim1; + difr -= difr_offset; + difl_dim1 = *ldu; + difl_offset = 1 + difl_dim1; + difl -= difl_offset; + vt_dim1 = *ldu; + vt_offset = 1 + vt_dim1; + vt -= vt_offset; + u_dim1 = *ldu; + u_offset = 1 + u_dim1; + u -= u_offset; + --k; + --givptr; + perm_dim1 = *ldgcol; + perm_offset = 1 + perm_dim1; + perm -= perm_offset; + givcol_dim1 = *ldgcol; + givcol_offset = 1 + givcol_dim1; + givcol -= givcol_offset; + --c__; + --s; + --work; + --iwork; + + /* Function Body */ + *info = 0; + + if (*icompq < 0 || *icompq > 1) { + *info = -1; + } else if (*smlsiz < 3) { + *info = -2; + } else if (*n < *smlsiz) { + *info = -3; + } else if (*nrhs < 1) { + *info = -4; + } else if (*ldb < *n) { + *info = -6; + } else if (*ldbx < *n) { + *info = -8; + } else if (*ldu < *n) { + *info = -10; + } else if (*ldgcol < *n) { + *info = -19; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SLALSA", &i__1); + return 0; + } + +/* Book-keeping and setting up the computation tree. */ + + inode = 1; + ndiml = inode + *n; + ndimr = ndiml + *n; + + slasdt_(n, &nlvl, &nd, &iwork[inode], &iwork[ndiml], &iwork[ndimr], + smlsiz); + +/* + The following code applies back the left singular vector factors. + For applying back the right singular vector factors, go to 50. +*/ + + if (*icompq == 1) { + goto L50; + } + +/* + The nodes on the bottom level of the tree were solved + by SLASDQ. The corresponding left and right singular vector + matrices are in explicit form. First apply back the left + singular vector matrices. +*/ + + ndb1 = (nd + 1) / 2; + i__1 = nd; + for (i__ = ndb1; i__ <= i__1; ++i__) { + +/* + IC : center row of each node + NL : number of rows of left subproblem + NR : number of rows of right subproblem + NLF: starting row of the left subproblem + NRF: starting row of the right subproblem +*/ + + i1 = i__ - 1; + ic = iwork[inode + i1]; + nl = iwork[ndiml + i1]; + nr = iwork[ndimr + i1]; + nlf = ic - nl; + nrf = ic + 1; + sgemm_("T", "N", &nl, nrhs, &nl, &c_b15, &u[nlf + u_dim1], ldu, &b[ + nlf + b_dim1], ldb, &c_b29, &bx[nlf + bx_dim1], ldbx); + sgemm_("T", "N", &nr, nrhs, &nr, &c_b15, &u[nrf + u_dim1], ldu, &b[ + nrf + b_dim1], ldb, &c_b29, &bx[nrf + bx_dim1], ldbx); +/* L10: */ + } + +/* + Next copy the rows of B that correspond to unchanged rows + in the bidiagonal matrix to BX. +*/ + + i__1 = nd; + for (i__ = 1; i__ <= i__1; ++i__) { + ic = iwork[inode + i__ - 1]; + scopy_(nrhs, &b[ic + b_dim1], ldb, &bx[ic + bx_dim1], ldbx); +/* L20: */ + } + +/* + Finally go through the left singular vector matrices of all + the other subproblems bottom-up on the tree. +*/ + + j = pow_ii(&c__2, &nlvl); + sqre = 0; + + for (lvl = nlvl; lvl >= 1; --lvl) { + lvl2 = (lvl << 1) - 1; + +/* + find the first node LF and last node LL on + the current level LVL +*/ + + if (lvl == 1) { + lf = 1; + ll = 1; + } else { + i__1 = lvl - 1; + lf = pow_ii(&c__2, &i__1); + ll = (lf << 1) - 1; + } + i__1 = ll; + for (i__ = lf; i__ <= i__1; ++i__) { + im1 = i__ - 1; + ic = iwork[inode + im1]; + nl = iwork[ndiml + im1]; + nr = iwork[ndimr + im1]; + nlf = ic - nl; + nrf = ic + 1; + --j; + slals0_(icompq, &nl, &nr, &sqre, nrhs, &bx[nlf + bx_dim1], ldbx, & + b[nlf + b_dim1], ldb, &perm[nlf + lvl * perm_dim1], & + givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, & + givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 * + poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf + + lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[ + j], &s[j], &work[1], info); +/* L30: */ + } +/* L40: */ + } + goto L90; + +/* ICOMPQ = 1: applying back the right singular vector factors. */ + +L50: + +/* + First now go through the right singular vector matrices of all + the tree nodes top-down. +*/ + + j = 0; + i__1 = nlvl; + for (lvl = 1; lvl <= i__1; ++lvl) { + lvl2 = (lvl << 1) - 1; + +/* + Find the first node LF and last node LL on + the current level LVL. +*/ + + if (lvl == 1) { + lf = 1; + ll = 1; + } else { + i__2 = lvl - 1; + lf = pow_ii(&c__2, &i__2); + ll = (lf << 1) - 1; + } + i__2 = lf; + for (i__ = ll; i__ >= i__2; --i__) { + im1 = i__ - 1; + ic = iwork[inode + im1]; + nl = iwork[ndiml + im1]; + nr = iwork[ndimr + im1]; + nlf = ic - nl; + nrf = ic + 1; + if (i__ == ll) { + sqre = 0; + } else { + sqre = 1; + } + ++j; + slals0_(icompq, &nl, &nr, &sqre, nrhs, &b[nlf + b_dim1], ldb, &bx[ + nlf + bx_dim1], ldbx, &perm[nlf + lvl * perm_dim1], & + givptr[j], &givcol[nlf + lvl2 * givcol_dim1], ldgcol, & + givnum[nlf + lvl2 * givnum_dim1], ldu, &poles[nlf + lvl2 * + poles_dim1], &difl[nlf + lvl * difl_dim1], &difr[nlf + + lvl2 * difr_dim1], &z__[nlf + lvl * z_dim1], &k[j], &c__[ + j], &s[j], &work[1], info); +/* L60: */ + } +/* L70: */ + } + +/* + The nodes on the bottom level of the tree were solved + by SLASDQ. The corresponding right singular vector + matrices are in explicit form. Apply them back. +*/ + + ndb1 = (nd + 1) / 2; + i__1 = nd; + for (i__ = ndb1; i__ <= i__1; ++i__) { + i1 = i__ - 1; + ic = iwork[inode + i1]; + nl = iwork[ndiml + i1]; + nr = iwork[ndimr + i1]; + nlp1 = nl + 1; + if (i__ == nd) { + nrp1 = nr; + } else { + nrp1 = nr + 1; + } + nlf = ic - nl; + nrf = ic + 1; + sgemm_("T", "N", &nlp1, nrhs, &nlp1, &c_b15, &vt[nlf + vt_dim1], ldu, + &b[nlf + b_dim1], ldb, &c_b29, &bx[nlf + bx_dim1], ldbx); + sgemm_("T", "N", &nrp1, nrhs, &nrp1, &c_b15, &vt[nrf + vt_dim1], ldu, + &b[nrf + b_dim1], ldb, &c_b29, &bx[nrf + bx_dim1], ldbx); +/* L80: */ + } + +L90: + + return 0; + +/* End of SLALSA */ + +} /* slalsa_ */ + +/* Subroutine */ int slalsd_(char *uplo, integer *smlsiz, integer *n, integer + *nrhs, real *d__, real *e, real *b, integer *ldb, real *rcond, + integer *rank, real *work, integer *iwork, integer *info) +{ + /* System generated locals */ + integer b_dim1, b_offset, i__1, i__2; + real r__1; + + /* Local variables */ + static integer c__, i__, j, k; + static real r__; + static integer s, u, z__; + static real cs; + static integer bx; + static real sn; + static integer st, vt, nm1, st1; + static real eps; + static integer iwk; + static real tol; + static integer difl, difr; + static real rcnd; + static integer perm, nsub, nlvl, sqre, bxst; + extern /* Subroutine */ int srot_(integer *, real *, integer *, real *, + integer *, real *, real *), sgemm_(char *, char *, integer *, + integer *, integer *, real *, real *, integer *, real *, integer * + , real *, real *, integer *); + static integer poles, sizei, nsize; + extern /* Subroutine */ int scopy_(integer *, real *, integer *, real *, + integer *); + static integer nwork, icmpq1, icmpq2; + extern doublereal slamch_(char *); + extern /* Subroutine */ int slasda_(integer *, integer *, integer *, + integer *, real *, real *, real *, integer *, real *, integer *, + real *, real *, real *, real *, integer *, integer *, integer *, + integer *, real *, real *, real *, real *, integer *, integer *), + xerbla_(char *, integer *), slalsa_(integer *, integer *, + integer *, integer *, real *, integer *, real *, integer *, real * + , integer *, real *, integer *, real *, real *, real *, real *, + integer *, integer *, integer *, integer *, real *, real *, real * + , real *, integer *, integer *), slascl_(char *, integer *, + integer *, real *, real *, integer *, integer *, real *, integer * + , integer *); + static integer givcol; + extern integer isamax_(integer *, real *, integer *); + extern /* Subroutine */ int slasdq_(char *, integer *, integer *, integer + *, integer *, integer *, real *, real *, real *, integer *, real * + , integer *, real *, integer *, real *, integer *), + slacpy_(char *, integer *, integer *, real *, integer *, real *, + integer *), slartg_(real *, real *, real *, real *, real * + ), slaset_(char *, integer *, integer *, real *, real *, real *, + integer *); + static real orgnrm; + static integer givnum; + extern doublereal slanst_(char *, integer *, real *, real *); + extern /* Subroutine */ int slasrt_(char *, integer *, real *, integer *); + static integer givptr, smlszp; + + +/* + -- LAPACK routine (version 3.2.2) -- + -- LAPACK is a software package provided by Univ. of Tennessee, -- + -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- + June 2010 + + + Purpose + ======= + + SLALSD uses the singular value decomposition of A to solve the least + squares problem of finding X to minimize the Euclidean norm of each + column of A*X-B, where A is N-by-N upper bidiagonal, and X and B + are N-by-NRHS. The solution X overwrites B. + + The singular values of A smaller than RCOND times the largest + singular value are treated as zero in solving the least squares + problem; in this case a minimum norm solution is returned. + The actual singular values are returned in D in ascending order. + + This code makes very mild assumptions about floating point + arithmetic. It will work on machines with a guard digit in + add/subtract, or on those binary machines without guard digits + which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2. + It could conceivably fail on hexadecimal or decimal machines + without guard digits, but we know of none. + + Arguments + ========= + + UPLO (input) CHARACTER*1 + = 'U': D and E define an upper bidiagonal matrix. + = 'L': D and E define a lower bidiagonal matrix. + + SMLSIZ (input) INTEGER + The maximum size of the subproblems at the bottom of the + computation tree. + + N (input) INTEGER + The dimension of the bidiagonal matrix. N >= 0. + + NRHS (input) INTEGER + The number of columns of B. NRHS must be at least 1. + + D (input/output) REAL array, dimension (N) + On entry D contains the main diagonal of the bidiagonal + matrix. On exit, if INFO = 0, D contains its singular values. + + E (input/output) REAL array, dimension (N-1) + Contains the super-diagonal entries of the bidiagonal matrix. + On exit, E has been destroyed. + + B (input/output) REAL array, dimension (LDB,NRHS) + On input, B contains the right hand sides of the least + squares problem. On output, B contains the solution X. + + LDB (input) INTEGER + The leading dimension of B in the calling subprogram. + LDB must be at least max(1,N). + + RCOND (input) REAL + The singular values of A less than or equal to RCOND times + the largest singular value are treated as zero in solving + the least squares problem. If RCOND is negative, + machine precision is used instead. + For example, if diag(S)*X=B were the least squares problem, + where diag(S) is a diagonal matrix of singular values, the + solution would be X(i) = B(i) / S(i) if S(i) is greater than + RCOND*max(S), and X(i) = 0 if S(i) is less than or equal to + RCOND*max(S). + + RANK (output) INTEGER + The number of singular values of A greater than RCOND times + the largest singular value. + + WORK (workspace) REAL array, dimension at least + (9*N + 2*N*SMLSIZ + 8*N*NLVL + N*NRHS + (SMLSIZ+1)**2), + where NLVL = max(0, INT(log_2 (N/(SMLSIZ+1))) + 1). + + IWORK (workspace) INTEGER array, dimension at least + (3*N*NLVL + 11*N) + + INFO (output) INTEGER + = 0: successful exit. + < 0: if INFO = -i, the i-th argument had an illegal value. + > 0: The algorithm failed to compute a singular value while + working on the submatrix lying in rows and columns + INFO/(N+1) through MOD(INFO,N+1). + + Further Details + =============== + + Based on contributions by + Ming Gu and Ren-Cang Li, Computer Science Division, University of + California at Berkeley, USA + Osni Marques, LBNL/NERSC, USA + + ===================================================================== + + + Test the input parameters. +*/ + + /* Parameter adjustments */ + --d__; + --e; + b_dim1 = *ldb; + b_offset = 1 + b_dim1; + b -= b_offset; + --work; + --iwork; + + /* Function Body */ + *info = 0; + + if (*n < 0) { + *info = -3; + } else if (*nrhs < 1) { + *info = -4; + } else if (*ldb < 1 || *ldb < *n) { + *info = -8; + } + if (*info != 0) { + i__1 = -(*info); + xerbla_("SLALSD", &i__1); + return 0; + } + + eps = slamch_("Epsilon"); + +/* Set up the tolerance. */ + + if (*rcond <= 0.f || *rcond >= 1.f) { + rcnd = eps; + } else { + rcnd = *rcond; + } + + *rank = 0; + +/* Quick return if possible. */ + + if (*n == 0) { + return 0; + } else if (*n == 1) { + if (d__[1] == 0.f) { + slaset_("A", &c__1, nrhs, &c_b29, &c_b29, &b[b_offset], ldb); + } else { + *rank = 1; + slascl_("G", &c__0, &c__0, &d__[1], &c_b15, &c__1, nrhs, &b[ + b_offset], ldb, info); + d__[1] = dabs(d__[1]); + } + return 0; + } + +/* Rotate the matrix if it is lower bidiagonal. */ + + if (*(unsigned char *)uplo == 'L') { + i__1 = *n - 1; + for (i__ = 1; i__ <= i__1; ++i__) { + slartg_(&d__[i__], &e[i__], &cs, &sn, &r__); + d__[i__] = r__; + e[i__] = sn * d__[i__ + 1]; + d__[i__ + 1] = cs * d__[i__ + 1]; + if (*nrhs == 1) { + srot_(&c__1, &b[i__ + b_dim1], &c__1, &b[i__ + 1 + b_dim1], & + c__1, &cs, &sn); + } else { + work[(i__ << 1) - 1] = cs; + work[i__ * 2] = sn; + } +/* L10: */ + } + if (*nrhs > 1) { + i__1 = *nrhs; + for (i__ = 1; i__ <= i__1; ++i__) { + i__2 = *n - 1; + for (j = 1; j <= i__2; ++j) { + cs = work[(j << 1) - 1]; + sn = work[j * 2]; + srot_(&c__1, &b[j + i__ * b_dim1], &c__1, &b[j + 1 + i__ * + b_dim1], &c__1, &cs, &sn); +/* L20: */ + } +/* L30: */ + } + } + } + +/* Scale. */ + + nm1 = *n - 1; + orgnrm = slanst_("M", n, &d__[1], &e[1]); + if (orgnrm == 0.f) { + slaset_("A", n, nrhs, &c_b29, &c_b29, &b[b_offset], ldb); + return 0; + } + + slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, &c__1, &d__[1], n, info); + slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, &nm1, &c__1, &e[1], &nm1, + info); + +/* + If N is smaller than the minimum divide size SMLSIZ, then solve + the problem with another solver. +*/ + + if (*n <= *smlsiz) { + nwork = *n * *n + 1; + slaset_("A", n, n, &c_b29, &c_b15, &work[1], n); + slasdq_("U", &c__0, n, n, &c__0, nrhs, &d__[1], &e[1], &work[1], n, & + work[1], n, &b[b_offset], ldb, &work[nwork], info); + if (*info != 0) { + return 0; + } + tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1)); + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if (d__[i__] <= tol) { + slaset_("A", &c__1, nrhs, &c_b29, &c_b29, &b[i__ + b_dim1], + ldb); + } else { + slascl_("G", &c__0, &c__0, &d__[i__], &c_b15, &c__1, nrhs, &b[ + i__ + b_dim1], ldb, info); + ++(*rank); + } +/* L40: */ + } + sgemm_("T", "N", n, nrhs, n, &c_b15, &work[1], n, &b[b_offset], ldb, & + c_b29, &work[nwork], n); + slacpy_("A", n, nrhs, &work[nwork], n, &b[b_offset], ldb); + +/* Unscale. */ + + slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n, + info); + slasrt_("D", n, &d__[1], info); + slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, nrhs, &b[b_offset], + ldb, info); + + return 0; + } + +/* Book-keeping and setting up some constants. */ + + nlvl = (integer) (log((real) (*n) / (real) (*smlsiz + 1)) / log(2.f)) + 1; + + smlszp = *smlsiz + 1; + + u = 1; + vt = *smlsiz * *n + 1; + difl = vt + smlszp * *n; + difr = difl + nlvl * *n; + z__ = difr + (nlvl * *n << 1); + c__ = z__ + nlvl * *n; + s = c__ + *n; + poles = s + *n; + givnum = poles + (nlvl << 1) * *n; + bx = givnum + (nlvl << 1) * *n; + nwork = bx + *n * *nrhs; + + sizei = *n + 1; + k = sizei + *n; + givptr = k + *n; + perm = givptr + *n; + givcol = perm + nlvl * *n; + iwk = givcol + (nlvl * *n << 1); + + st = 1; + sqre = 0; + icmpq1 = 1; + icmpq2 = 0; + nsub = 0; + + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + if ((r__1 = d__[i__], dabs(r__1)) < eps) { + d__[i__] = r_sign(&eps, &d__[i__]); + } +/* L50: */ + } + + i__1 = nm1; + for (i__ = 1; i__ <= i__1; ++i__) { + if ((r__1 = e[i__], dabs(r__1)) < eps || i__ == nm1) { + ++nsub; + iwork[nsub] = st; + +/* + Subproblem found. First determine its size and then + apply divide and conquer on it. +*/ + + if (i__ < nm1) { + +/* A subproblem with E(I) small for I < NM1. */ + + nsize = i__ - st + 1; + iwork[sizei + nsub - 1] = nsize; + } else if ((r__1 = e[i__], dabs(r__1)) >= eps) { + +/* A subproblem with E(NM1) not too small but I = NM1. */ + + nsize = *n - st + 1; + iwork[sizei + nsub - 1] = nsize; + } else { + +/* + A subproblem with E(NM1) small. This implies an + 1-by-1 subproblem at D(N), which is not solved + explicitly. +*/ + + nsize = i__ - st + 1; + iwork[sizei + nsub - 1] = nsize; + ++nsub; + iwork[nsub] = *n; + iwork[sizei + nsub - 1] = 1; + scopy_(nrhs, &b[*n + b_dim1], ldb, &work[bx + nm1], n); + } + st1 = st - 1; + if (nsize == 1) { + +/* + This is a 1-by-1 subproblem and is not solved + explicitly. +*/ + + scopy_(nrhs, &b[st + b_dim1], ldb, &work[bx + st1], n); + } else if (nsize <= *smlsiz) { + +/* This is a small subproblem and is solved by SLASDQ. */ + + slaset_("A", &nsize, &nsize, &c_b29, &c_b15, &work[vt + st1], + n); + slasdq_("U", &c__0, &nsize, &nsize, &c__0, nrhs, &d__[st], &e[ + st], &work[vt + st1], n, &work[nwork], n, &b[st + + b_dim1], ldb, &work[nwork], info); + if (*info != 0) { + return 0; + } + slacpy_("A", &nsize, nrhs, &b[st + b_dim1], ldb, &work[bx + + st1], n); + } else { + +/* A large problem. Solve it using divide and conquer. */ + + slasda_(&icmpq1, smlsiz, &nsize, &sqre, &d__[st], &e[st], & + work[u + st1], n, &work[vt + st1], &iwork[k + st1], & + work[difl + st1], &work[difr + st1], &work[z__ + st1], + &work[poles + st1], &iwork[givptr + st1], &iwork[ + givcol + st1], n, &iwork[perm + st1], &work[givnum + + st1], &work[c__ + st1], &work[s + st1], &work[nwork], + &iwork[iwk], info); + if (*info != 0) { + return 0; + } + bxst = bx + st1; + slalsa_(&icmpq2, smlsiz, &nsize, nrhs, &b[st + b_dim1], ldb, & + work[bxst], n, &work[u + st1], n, &work[vt + st1], & + iwork[k + st1], &work[difl + st1], &work[difr + st1], + &work[z__ + st1], &work[poles + st1], &iwork[givptr + + st1], &iwork[givcol + st1], n, &iwork[perm + st1], & + work[givnum + st1], &work[c__ + st1], &work[s + st1], + &work[nwork], &iwork[iwk], info); + if (*info != 0) { + return 0; + } + } + st = i__ + 1; + } +/* L60: */ + } + +/* Apply the singular values and treat the tiny ones as zero. */ + + tol = rcnd * (r__1 = d__[isamax_(n, &d__[1], &c__1)], dabs(r__1)); + + i__1 = *n; + for (i__ = 1; i__ <= i__1; ++i__) { + +/* + Some of the elements in D can be negative because 1-by-1 + subproblems were not solved explicitly. +*/ + + if ((r__1 = d__[i__], dabs(r__1)) <= tol) { + slaset_("A", &c__1, nrhs, &c_b29, &c_b29, &work[bx + i__ - 1], n); + } else { + ++(*rank); + slascl_("G", &c__0, &c__0, &d__[i__], &c_b15, &c__1, nrhs, &work[ + bx + i__ - 1], n, info); + } + d__[i__] = (r__1 = d__[i__], dabs(r__1)); +/* L70: */ + } + +/* Now apply back the right singular vectors. */ + + icmpq2 = 1; + i__1 = nsub; + for (i__ = 1; i__ <= i__1; ++i__) { + st = iwork[i__]; + st1 = st - 1; + nsize = iwork[sizei + i__ - 1]; + bxst = bx + st1; + if (nsize == 1) { + scopy_(nrhs, &work[bxst], n, &b[st + b_dim1], ldb); + } else if (nsize <= *smlsiz) { + sgemm_("T", "N", &nsize, nrhs, &nsize, &c_b15, &work[vt + st1], n, + &work[bxst], n, &c_b29, &b[st + b_dim1], ldb); + } else { + slalsa_(&icmpq2, smlsiz, &nsize, nrhs, &work[bxst], n, &b[st + + b_dim1], ldb, &work[u + st1], n, &work[vt + st1], &iwork[ + k + st1], &work[difl + st1], &work[difr + st1], &work[z__ + + st1], &work[poles + st1], &iwork[givptr + st1], &iwork[ + givcol + st1], n, &iwork[perm + st1], &work[givnum + st1], + &work[c__ + st1], &work[s + st1], &work[nwork], &iwork[ + iwk], info); + if (*info != 0) { + return 0; + } + } +/* L80: */ + } + +/* Unscale and sort the singular values. */ + + slascl_("G", &c__0, &c__0, &c_b15, &orgnrm, n, &c__1, &d__[1], n, info); + slasrt_("D", n, &d__[1], info); + slascl_("G", &c__0, &c__0, &orgnrm, &c_b15, n, nrhs, &b[b_offset], ldb, + info); + + return 0; + +/* End of SLALSD */ + +} /* slalsd_ */ + /* Subroutine */ int slamrg_(integer *n1, integer *n2, real *a, integer * strd1, integer *strd2, integer *index) { @@ -29679,7 +31767,7 @@ doublereal slapy3_(real *x, real *y, real *z__) /* Note that M is very tiny */ if (l == 0.f) { - t = r_sign(&c_b2863, &ft) * r_sign(&c_b15, >); + t = r_sign(&c_b3178, &ft) * r_sign(&c_b15, >); } else { t = gt / r_sign(&d__, &ft) + m / t; } diff --git a/numpy/linalg/lapack_lite/wrapped_routines b/numpy/linalg/lapack_lite/wrapped_routines index e949890979d2..0d99c724d23f 100644 --- a/numpy/linalg/lapack_lite/wrapped_routines +++ b/numpy/linalg/lapack_lite/wrapped_routines @@ -1,5 +1,6 @@ ccopy cgeev +cgelsd cgemm cgesdd cgesv @@ -23,6 +24,7 @@ dpotrs dsyevd scopy sgeev +sgelsd sgemm sgesdd sgesv diff --git a/numpy/linalg/linalg.py b/numpy/linalg/linalg.py index de25d25e9a39..f073abadf505 100644 --- a/numpy/linalg/linalg.py +++ b/numpy/linalg/linalg.py @@ -97,6 +97,9 @@ def _raise_linalgerror_eigenvalues_nonconvergence(err, flag): def _raise_linalgerror_svd_nonconvergence(err, flag): raise LinAlgError("SVD did not converge") +def _raise_linalgerror_lstsq(err, flag): + raise LinAlgError("SVD did not converge in Linear Least Squares") + def get_linalg_error_extobj(callback): extobj = list(_linalg_error_extobj) # make a copy extobj[2] = callback @@ -1997,7 +2000,6 @@ def lstsq(a, b, rcond="warn"): >>> plt.show() """ - import math a, _ = _makearray(a) b, wrap = _makearray(b) is_1d = b.ndim == 1 @@ -2008,7 +2010,6 @@ def lstsq(a, b, rcond="warn"): m = a.shape[0] n = a.shape[1] n_rhs = b.shape[1] - ldb = max(n, m) if m != b.shape[0]: raise LinAlgError('Incompatible dimensions') @@ -2028,62 +2029,25 @@ def lstsq(a, b, rcond="warn"): FutureWarning, stacklevel=2) rcond = -1 if rcond is None: - rcond = finfo(t).eps * ldb - - bstar = zeros((ldb, n_rhs), t) - bstar[:m, :n_rhs] = b - a, bstar = _fastCopyAndTranspose(t, a, bstar) - a, bstar = _to_native_byte_order(a, bstar) - s = zeros((min(m, n),), real_t) - # This line: - # * is incorrect, according to the LAPACK documentation - # * raises a ValueError if min(m,n) == 0 - # * should not be calculated here anyway, as LAPACK should calculate - # `liwork` for us. But that only works if our version of lapack does - # not have this bug: - # http://icl.cs.utk.edu/lapack-forum/archives/lapack/msg00899.html - # Lapack_lite does have that bug... - nlvl = max( 0, int( math.log( float(min(m, n))/2. ) ) + 1 ) - iwork = zeros((3*min(m, n)*nlvl+11*min(m, n),), fortran_int) - if isComplexType(t): - lapack_routine = lapack_lite.zgelsd - lwork = 1 - rwork = zeros((lwork,), real_t) - work = zeros((lwork,), t) - results = lapack_routine(m, n, n_rhs, a, m, bstar, ldb, s, rcond, - 0, work, -1, rwork, iwork, 0) - lrwork = int(rwork[0]) - lwork = int(work[0].real) - work = zeros((lwork,), t) - rwork = zeros((lrwork,), real_t) - results = lapack_routine(m, n, n_rhs, a, m, bstar, ldb, s, rcond, - 0, work, lwork, rwork, iwork, 0) + rcond = finfo(t).eps * max(n, m) + + if m <= n: + gufunc = _umath_linalg.lstsq_m else: - lapack_routine = lapack_lite.dgelsd - lwork = 1 - work = zeros((lwork,), t) - results = lapack_routine(m, n, n_rhs, a, m, bstar, ldb, s, rcond, - 0, work, -1, iwork, 0) - lwork = int(work[0]) - work = zeros((lwork,), t) - results = lapack_routine(m, n, n_rhs, a, m, bstar, ldb, s, rcond, - 0, work, lwork, iwork, 0) - if results['info'] > 0: - raise LinAlgError('SVD did not converge in Linear Least Squares') - - # undo transpose imposed by fortran-order arrays - b_out = bstar.T + gufunc = _umath_linalg.lstsq_n + + signature = 'DDd->Did' if isComplexType(t) else 'ddd->did' + extobj = get_linalg_error_extobj(_raise_linalgerror_lstsq) + b_out, rank, s = gufunc(a, b, rcond, signature=signature, extobj=extobj) # b_out contains both the solution and the components of the residuals - x = b_out[:n,:] - r_parts = b_out[n:,:] + x = b_out[...,:n,:] + r_parts = b_out[...,n:,:] if isComplexType(t): resids = sum(abs(r_parts)**2, axis=-2) else: resids = sum(r_parts**2, axis=-2) - rank = results['rank'] - # remove the axis we added if is_1d: x = x.squeeze(axis=-1) diff --git a/numpy/linalg/umath_linalg.c.src b/numpy/linalg/umath_linalg.c.src index 0248518ac7cb..d8cfdf6acd43 100644 --- a/numpy/linalg/umath_linalg.c.src +++ b/numpy/linalg/umath_linalg.c.src @@ -129,12 +129,26 @@ FNAME(zheevd)(char *jobz, char *uplo, int *n, int *info); extern int +FNAME(sgelsd)(int *m, int *n, int *nrhs, + float a[], int *lda, float b[], int *ldb, + float s[], float *rcond, int *rank, + float work[], int *lwork, int iwork[], + int *info); +extern int FNAME(dgelsd)(int *m, int *n, int *nrhs, double a[], int *lda, double b[], int *ldb, double s[], double *rcond, int *rank, double work[], int *lwork, int iwork[], int *info); extern int +FNAME(cgelsd)(int *m, int *n, int *nrhs, + f2c_complex a[], int *lda, + f2c_complex b[], int *ldb, + float s[], float *rcond, int *rank, + f2c_complex work[], int *lwork, + float rwork[], int iwork[], + int *info); +extern int FNAME(zgelsd)(int *m, int *n, int *nrhs, f2c_doublecomplex a[], int *lda, f2c_doublecomplex b[], int *ldb, @@ -492,6 +506,7 @@ static void init_constants(void) * columns: number of columns in the matrix * row_strides: the number bytes between consecutive rows. * column_strides: the number of bytes between consecutive columns. + * output_lead_dim: BLAS/LAPACK-side leading dimension, in elements */ typedef struct linearize_data_struct { @@ -499,8 +514,24 @@ typedef struct linearize_data_struct npy_intp columns; npy_intp row_strides; npy_intp column_strides; + npy_intp output_lead_dim; } LINEARIZE_DATA_t; +static NPY_INLINE void +init_linearize_data_ex(LINEARIZE_DATA_t *lin_data, + npy_intp rows, + npy_intp columns, + npy_intp row_strides, + npy_intp column_strides, + npy_intp output_lead_dim) +{ + lin_data->rows = rows; + lin_data->columns = columns; + lin_data->row_strides = row_strides; + lin_data->column_strides = column_strides; + lin_data->output_lead_dim = output_lead_dim; +} + static NPY_INLINE void init_linearize_data(LINEARIZE_DATA_t *lin_data, npy_intp rows, @@ -508,10 +539,8 @@ init_linearize_data(LINEARIZE_DATA_t *lin_data, npy_intp row_strides, npy_intp column_strides) { - lin_data->rows = rows; - lin_data->columns = columns; - lin_data->row_strides = row_strides; - lin_data->column_strides = column_strides; + init_linearize_data_ex( + lin_data, rows, columns, row_strides, column_strides, columns); } static NPY_INLINE void @@ -846,7 +875,7 @@ linearize_@TYPE@_matrix(void *dst_in, } } src += data->row_strides/sizeof(@typ@); - dst += data->columns; + dst += data->output_lead_dim; } return rv; } else { @@ -893,7 +922,7 @@ delinearize_@TYPE@_matrix(void *dst_in, sizeof(@typ@)); } } - src += data->columns; + src += data->output_lead_dim; dst += data->row_strides/sizeof(@typ@); } @@ -2871,6 +2900,359 @@ static void /**end repeat**/ + +/* -------------------------------------------------------------------------- */ + /* least squares */ + +typedef struct gelsd_params_struct +{ + fortran_int M; + fortran_int N; + fortran_int NRHS; + void *A; + fortran_int LDA; + void *B; + fortran_int LDB; + void *S; + void *RCOND; + fortran_int RANK; + void *WORK; + fortran_int LWORK; + void *RWORK; + void *IWORK; +} GELSD_PARAMS_t; + + +static inline void +dump_gelsd_params(const char *name, + GELSD_PARAMS_t *params) +{ + TRACE_TXT("\n%s:\n"\ + + "%14s: %18p\n"\ + "%14s: %18p\n"\ + "%14s: %18p\n"\ + "%14s: %18p\n"\ + "%14s: %18p\n"\ + "%14s: %18p\n"\ + + "%14s: %18d\n"\ + "%14s: %18d\n"\ + "%14s: %18d\n"\ + "%14s: %18d\n"\ + "%14s: %18d\n"\ + "%14s: %18d\n"\ + "%14s: %18d\n"\ + + "%14s: %18p\n", + + name, + + "A", params->A, + "B", params->B, + "S", params->S, + "WORK", params->WORK, + "RWORK", params->RWORK, + "IWORK", params->IWORK, + + "M", (int)params->M, + "N", (int)params->N, + "NRHS", (int)params->NRHS, + "LDA", (int)params->LDA, + "LDB", (int)params->LDB, + "LWORK", (int)params->LWORK, + "RANK", (int)params->RANK, + + "RCOND", params->RCOND); +} + + +/**begin repeat + #TYPE=FLOAT,DOUBLE# + #lapack_func=sgelsd,dgelsd# + #ftyp=fortran_real,fortran_doublereal# + */ + +static inline fortran_int +call_@lapack_func@(GELSD_PARAMS_t *params) +{ + fortran_int rv; + LAPACK(@lapack_func@)(¶ms->M, ¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, + params->B, ¶ms->LDB, + params->S, + params->RCOND, ¶ms->RANK, + params->WORK, ¶ms->LWORK, + params->IWORK, + &rv); + return rv; +} + +static inline int +init_@lapack_func@(GELSD_PARAMS_t *params, + fortran_int m, + fortran_int n, + fortran_int nrhs) +{ + npy_uint8 *mem_buff = NULL; + npy_uint8 *mem_buff2 = NULL; + npy_uint8 *a, *b, *s, *work, *iwork; + fortran_int min_m_n = fortran_int_min(m, n); + fortran_int max_m_n = fortran_int_max(m, n); + size_t safe_min_m_n = min_m_n; + size_t safe_max_m_n = max_m_n; + size_t safe_m = m; + size_t safe_n = n; + size_t safe_nrhs = nrhs; + + size_t a_size = safe_m * safe_n * sizeof(@ftyp@); + size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@); + size_t s_size = safe_min_m_n * sizeof(@ftyp@); + + fortran_int work_count; + size_t work_size; + size_t iwork_size; + fortran_int lda = fortran_int_max(1, m); + fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n)); + + mem_buff = malloc(a_size + b_size + s_size); + + if (!mem_buff) + goto error; + + a = mem_buff; + b = a + a_size; + s = b + b_size; + + + params->M = m; + params->N = n; + params->NRHS = nrhs; + params->A = a; + params->B = b; + params->S = s; + params->LDA = lda; + params->LDB = ldb; + + { + /* compute optimal work size */ + @ftyp@ work_size_query; + fortran_int iwork_size_query; + + params->WORK = &work_size_query; + params->IWORK = &iwork_size_query; + params->RWORK = NULL; + params->LWORK = -1; + + if (call_@lapack_func@(params) != 0) + goto error; + + work_count = (fortran_int)work_size_query; + + work_size = (size_t) work_size_query * sizeof(@ftyp@); + iwork_size = (size_t)iwork_size_query * sizeof(fortran_int); + } + + mem_buff2 = malloc(work_size + iwork_size); + if (!mem_buff2) + goto error; + + work = mem_buff2; + iwork = work + work_size; + + params->WORK = work; + params->RWORK = NULL; + params->IWORK = iwork; + params->LWORK = work_count; + + return 1; + error: + TRACE_TXT("%s failed init\n", __FUNCTION__); + free(mem_buff); + free(mem_buff2); + memset(params, 0, sizeof(*params)); + + return 0; +} + +/**end repeat**/ + +/**begin repeat + #TYPE=CFLOAT,CDOUBLE# + #ftyp=fortran_complex,fortran_doublecomplex# + #frealtyp=fortran_real,fortran_doublereal# + #typ=COMPLEX_t,DOUBLECOMPLEX_t# + #lapack_func=cgelsd,zgelsd# + */ + +static inline fortran_int +call_@lapack_func@(GELSD_PARAMS_t *params) +{ + fortran_int rv; + LAPACK(@lapack_func@)(¶ms->M, ¶ms->N, ¶ms->NRHS, + params->A, ¶ms->LDA, + params->B, ¶ms->LDB, + params->S, + params->RCOND, ¶ms->RANK, + params->WORK, ¶ms->LWORK, + params->RWORK, params->IWORK, + &rv); + return rv; +} + +static inline int +init_@lapack_func@(GELSD_PARAMS_t *params, + fortran_int m, + fortran_int n, + fortran_int nrhs) +{ + npy_uint8 *mem_buff = NULL; + npy_uint8 *mem_buff2 = NULL; + npy_uint8 *a, *b, *s, *work, *iwork, *rwork; + fortran_int min_m_n = fortran_int_min(m, n); + fortran_int max_m_n = fortran_int_max(m, n); + size_t safe_min_m_n = min_m_n; + size_t safe_max_m_n = max_m_n; + size_t safe_m = m; + size_t safe_n = n; + size_t safe_nrhs = nrhs; + + size_t a_size = safe_m * safe_n * sizeof(@ftyp@); + size_t b_size = safe_max_m_n * safe_nrhs * sizeof(@ftyp@); + size_t s_size = safe_min_m_n * sizeof(@frealtyp@); + + fortran_int work_count; + size_t work_size, rwork_size, iwork_size; + fortran_int lda = fortran_int_max(1, m); + fortran_int ldb = fortran_int_max(1, fortran_int_max(m,n)); + + mem_buff = malloc(a_size + b_size + s_size); + + if (!mem_buff) + goto error; + + a = mem_buff; + b = a + a_size; + s = b + b_size; + + + params->M = m; + params->N = n; + params->NRHS = nrhs; + params->A = a; + params->B = b; + params->S = s; + params->LDA = lda; + params->LDB = ldb; + + { + /* compute optimal work size */ + @ftyp@ work_size_query; + @frealtyp@ rwork_size_query; + fortran_int iwork_size_query; + + params->WORK = &work_size_query; + params->IWORK = &iwork_size_query; + params->RWORK = &rwork_size_query; + params->LWORK = -1; + + if (call_@lapack_func@(params) != 0) + goto error; + + work_count = (fortran_int)work_size_query.r; + + work_size = (size_t )work_size_query.r * sizeof(@ftyp@); + rwork_size = (size_t)rwork_size_query * sizeof(@frealtyp@); + iwork_size = (size_t)iwork_size_query * sizeof(fortran_int); + } + + mem_buff2 = malloc(work_size + rwork_size + iwork_size); + if (!mem_buff2) + goto error; + + work = mem_buff2; + rwork = work + work_size; + iwork = rwork + rwork_size; + + params->WORK = work; + params->RWORK = rwork; + params->IWORK = iwork; + params->LWORK = work_count; + + return 1; + error: + TRACE_TXT("%s failed init\n", __FUNCTION__); + free(mem_buff); + free(mem_buff2); + memset(params, 0, sizeof(*params)); + + return 0; +} + +/**end repeat**/ + + +/**begin repeat + #TYPE=FLOAT,DOUBLE,CFLOAT,CDOUBLE# + #REALTYPE=FLOAT,DOUBLE,FLOAT,DOUBLE# + #lapack_func=sgelsd,dgelsd,cgelsd,zgelsd# + */ +static inline void +release_@lapack_func@(GELSD_PARAMS_t* params) +{ + /* A and WORK contain allocated blocks */ + free(params->A); + free(params->WORK); + memset(params, 0, sizeof(*params)); +} + +static void +@TYPE@_lstsq(char **args, npy_intp *dimensions, npy_intp *steps, + void *NPY_UNUSED(func)) +{ + GELSD_PARAMS_t params; + int error_occurred = get_fp_invalid_and_clear(); + fortran_int n, m, nrhs; + INIT_OUTER_LOOP_6 + + m = (fortran_int)dimensions[0]; + n = (fortran_int)dimensions[1]; + nrhs = (fortran_int)dimensions[2]; + + if (init_@lapack_func@(¶ms, m, n, nrhs)) { + LINEARIZE_DATA_t a_in, b_in, x_out, s_out; + + init_linearize_data(&a_in, n, m, steps[1], steps[0]); + init_linearize_data_ex(&b_in, nrhs, m, steps[3], steps[2], fortran_int_max(n, m)); + init_linearize_data(&x_out, nrhs, fortran_int_max(n, m), steps[5], steps[4]); + init_linearize_data(&s_out, 1, fortran_int_min(n, m), 1, steps[6]); + + BEGIN_OUTER_LOOP_6 + int not_ok; + linearize_@TYPE@_matrix(params.A, args[0], &a_in); + linearize_@TYPE@_matrix(params.B, args[1], &b_in); + params.RCOND = args[2]; + not_ok = call_@lapack_func@(¶ms); + if (!not_ok) { + delinearize_@TYPE@_matrix(args[3], params.B, &x_out); + *(npy_int*) args[4] = params.RANK; + delinearize_@REALTYPE@_matrix(args[5], params.S, &s_out); + } else { + error_occurred = 1; + nan_@TYPE@_matrix(args[3], &x_out); + *(npy_int*) args[4] = -1; + nan_@REALTYPE@_matrix(args[5], &s_out); + } + END_OUTER_LOOP + + release_@lapack_func@(¶ms); + } + + set_fp_invalid_or_clear(error_occurred); +} + +/**end repeat**/ + #pragma GCC diagnostic pop /* -------------------------------------------------------------------------- */ @@ -2941,6 +3323,7 @@ GUFUNC_FUNC_ARRAY_REAL_COMPLEX(cholesky_lo); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_N); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_S); GUFUNC_FUNC_ARRAY_REAL_COMPLEX(svd_A); +GUFUNC_FUNC_ARRAY_REAL_COMPLEX(lstsq); GUFUNC_FUNC_ARRAY_EIG(eig); GUFUNC_FUNC_ARRAY_EIG(eigvals); @@ -3006,6 +3389,14 @@ static char svd_1_3_types[] = { NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE }; +/* A, b, rcond, x, rank, s */ +static char lstsq_types[] = { + NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_INT, NPY_FLOAT, + NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_INT, NPY_DOUBLE, + NPY_CFLOAT, NPY_CFLOAT, NPY_FLOAT, NPY_CFLOAT, NPY_INT, NPY_FLOAT, + NPY_CDOUBLE, NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE, NPY_INT, NPY_DOUBLE +}; + typedef struct gufunc_descriptor_struct { char *name; char *signature; @@ -3192,12 +3583,29 @@ GUFUNC_DESCRIPTOR_t gufunc_descriptors [] = { "eigvals", "(m,m)->(m)", "eigvals on the last two dimension and broadcast to the rest. \n"\ - "Results in a vector of eigenvalues. \n"\ - " \"(m,m)->(m),(m,m)\" \n", + "Results in a vector of eigenvalues. \n", 3, 1, 1, FUNC_ARRAY_NAME(eigvals), eigvals_types }, + { + "lstsq_m", + "(m,n),(m,nrhs),()->(n,nrhs),(),(m)", + "least squares on the last two dimensions and broadcast to the rest. \n"\ + "For m <= n. \n", + 4, 3, 3, + FUNC_ARRAY_NAME(lstsq), + lstsq_types + }, + { + "lstsq_n", + "(m,n),(m,nrhs),()->(m,nrhs),(),(n)", + "least squares on the last two dimensions and broadcast to the rest. \n"\ + "For m >= n. \n", + 4, 3, 3, + FUNC_ARRAY_NAME(lstsq), + lstsq_types + } }; static void