# Sum the elements a[ifirst], a[ifirst+1], ..., a[ilast] with a simple
# (non-recursive) loop.
#
# Arguments:
#   a      -- the array to sum over, accessed by linear index
#   ifirst -- index of the first element included in the sum
#   ilast  -- index of the last element included in the sum; ilast < ifirst
#             denotes an empty range
#
# Returns zero(eltype(a)) for an empty range, otherwise the sum of the
# selected elements.
#
# For more than four elements the sum is accumulated in four interleaved
# partial sums that are combined at the end, so floating-point results can
# differ in the last bits from a strict left-to-right accumulation.
#
# All element accesses are wrapped in @inbounds: the caller must guarantee that
# ifirst:ilast lies within the valid indices of a.
function sum_seq(a::AbstractArray, ifirst::Int, ilast::Int)
    # Only a truly empty range needs an additive identity.  Every other case
    # is seeded from the first element, so arrays whose element type has no
    # zero (e.g. Any[1, 2.5]) can still be summed and a lone -0.0 keeps its
    # sign.
    if ifirst > ilast
        return zero(eltype(a))
    end

    @inbounds if ifirst + 3 >= ilast    # one to four elements
        s = a[ifirst]
        i = ifirst + 1
        while i <= ilast
            s += a[i]
            i += 1
        end
        return s

    else    # five or more elements

        # More effective utilization of the instruction pipeline through
        # manually unrolling the sum into four-way accumulation.  The
        # original author's benchmark reported about a 2x speed-up.

        s1 = a[ifirst]
        s2 = a[ifirst + 1]
        s3 = a[ifirst + 2]
        s4 = a[ifirst + 3]

        # il is the last index at which a full group of four still fits
        i = ifirst + 4
        il = ilast - 3
        while i <= il
            s1 += a[i]
            s2 += a[i+1]
            s3 += a[i+2]
            s4 += a[i+3]
            i += 4
        end

        # at most three leftover elements go into the first accumulator
        while i <= ilast
            s1 += a[i]
            i += 1
        end

        return s1 + s2 + s3 + s4
    end
end


# Pairwise (cascade) summation of a[ifirst:ilast], which has O(log n) error growth
# [vs O(n) for a simple loop] with negligible performance cost if
# the base case is large enough.  See, e.g.:
#        [...]
#     in practice.  See:
#        Manfred Tasche and Hansmartin Zeuner, Handbook of
#        Analytic-Computational Methods in Applied Mathematics (2000).
# Largest range (in elements) that sum_pairwise hands to sum_seq in one piece.
# Note: sum_seq uses four accumulators, so each accumulator gets at most 256 numbers.
const PAIRWISE_SUM_BLOCKSIZE = 1024

# Sum a[ifirst:ilast] by recursive halving.
#
# Arguments:
#   a      -- the array to sum over, accessed by linear index
#   ifirst -- index of the first element included in the sum
#   ilast  -- index of the last element included in the sum
#
# A range of at most PAIRWISE_SUM_BLOCKSIZE elements (including an empty one)
# is passed straight to sum_seq.  A longer range is split at its midpoint, each
# half is summed recursively, and the two partial sums are added.  Because the
# additions are grouped differently than in a plain loop, floating-point
# results can differ in the last bits from other summation orders; compare
# them approximately rather than with ==.
function sum_pairwise(a::AbstractArray, ifirst::Int, ilast::Int)
    # Equivalent to (ilast - ifirst + 1) <= PAIRWISE_SUM_BLOCKSIZE.  The strict
    # comparison matters: with >= a block of PAIRWISE_SUM_BLOCKSIZE + 1
    # elements would still be summed sequentially.
    if ifirst + PAIRWISE_SUM_BLOCKSIZE > ilast
        return sum_seq(a, ifirst, ilast)
    else
        imid = (ifirst + ilast) >>> 1
        return sum_pairwise(a, ifirst, imid) + sum_pairwise(a, imid+1, ilast)
    end
end

# Sum of all elements of a, using pairwise summation over the linear indices
# 1:length(a).  An empty array falls through to sum_seq's empty-range case.
sum(a::AbstractArray) = sum_pairwise(a, 1, length(a))

# Integer arrays skip the recursive splitting and go directly to sum_seq
# (presumably because integer addition has no rounding error to control --
# confirm with the author).
sum{T<:Integer}(a::AbstractArray{T}) = sum_seq(a, 1, length(a))


# Kahan (compensated) summation: O(1) error growth, at the expense
# of a considerable increase in computational expense.