@@ -2447,9 +2447,8 @@ Since lo**2 is less than 1/2 ulp(csum), we have csum+lo*lo == csum.
 To minimize loss of information during the accumulation of fractional
 values, each term has a separate accumulator. This also breaks up
 sequential dependencies in the inner loop so the CPU can maximize
-floating point throughput. [4] On a 2.6 GHz Haswell, adding one
-dimension has an incremental cost of only 5ns -- for example when
-moving from hypot(x,y) to hypot(x,y,z).
+floating point throughput. [4] On an Apple M1 Max, hypot(*vec)
+takes only 3.33 µsec when len(vec) == 1000.
 
 The square root differential correction is needed because a
 correctly rounded square root of a correctly rounded sum of
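
(Aside: a minimal C sketch, separate from mathmodule.c, of why independent
accumulators help. In vector_norm() the independent chains are csum, frac1,
and frac2; the names below are illustrative only.)

    /* Sketch only: two independent accumulator chains let the CPU overlap
       floating point adds, whereas a single accumulator serializes them. */
    double sum_serial(const double *v, int n) {
        double s = 0.0;
        for (int i = 0; i < n; i++)
            s += v[i];                  /* each add waits on the previous add */
        return s;
    }

    double sum_two_chains(const double *v, int n) {
        double s0 = 0.0, s1 = 0.0;      /* independent dependency chains */
        int i;
        for (i = 0; i + 1 < n; i += 2) {
            s0 += v[i];                 /* these two adds have no data */
            s1 += v[i + 1];             /* dependency on each other */
        }
        if (i < n)
            s0 += v[i];                 /* odd leftover element */
        return s0 + s1;
    }
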
@@ -2473,7 +2472,7 @@ step is exact. The Neumaier summation computes as if in doubled
 precision (106 bits) and has the advantage that its input squares
 are non-negative so that the condition number of the sum is one.
 The square root with a differential correction is likewise computed
-as if in double precision.
+as if in doubled precision.
 
 For n <= 1000, prior to the final addition that rounds the overall
 result, the internal accuracy of "h" together with its correction of
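
(Aside: a compact sketch of Neumaier summation, the classic algorithm the
comment names; this is not the CPython helper itself.)

    #include <math.h>

    /* Neumaier summation: "comp" retains the low-order bits that the naive
       "sum += x" would discard, so the pair (sum, comp) behaves like a
       doubled precision accumulator. */
    double neumaier_sum(const double *v, int n) {
        double sum = 0.0, comp = 0.0;
        for (int i = 0; i < n; i++) {
            double x = v[i];
            double t = sum + x;
            if (fabs(sum) >= fabs(x))
                comp += (sum - t) + x;  /* bits lost from x */
            else
                comp += (x - t) + sum;  /* bits lost from sum */
            sum = t;
        }
        return sum + comp;              /* single lossy rounding at the end */
    }
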
@@ -2514,12 +2513,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
     }
     frexp(max, &max_e);
     if (max_e < -1023) {
-        /* When max_e < -1023, ldexp(1.0, -max_e) would overflow.
-           So we first perform lossless scaling from subnormals back to normals,
-           then recurse back to vector_norm(), and then finally undo the scaling.
-        */
+        /* When max_e < -1023, ldexp(1.0, -max_e) would overflow. */
         for (i=0 ; i < n ; i++) {
-            vec[i] /= DBL_MIN;
+            vec[i] /= DBL_MIN;          // convert subnormals to normals
         }
         return DBL_MIN * vector_norm(n, vec, max / DBL_MIN, found_nan);
     }
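
(Aside: the recursion is safe because DBL_MIN is a power of two, so dividing
a subnormal by it is exact in both directions. A standalone check, not part
of mathmodule.c:)

    #include <assert.h>
    #include <float.h>

    int main(void) {
        double tiny = DBL_MIN / 1024.0;    /* a subnormal: 0x1p-1032 */
        double scaled = tiny / DBL_MIN;    /* exact: 0x1p-10, now normal */
        assert(scaled == 0x1p-10);
        assert(scaled * DBL_MIN == tiny);  /* undoing the scaling is exact too */
        return 0;
    }
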
@@ -2529,17 +2525,14 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
     for (i=0 ; i < n ; i++) {
         x = vec[i];
         assert(Py_IS_FINITE(x) && fabs(x) <= max);
-
-        x *= scale;
+        x *= scale;                     // lossless scaling
         assert(fabs(x) < 1.0);
-
-        pr = dl_mul(x, x);
+        pr = dl_mul(x, x);              // lossless squaring
         assert(pr.hi <= 1.0);
-
-        sm = dl_fast_sum(csum, pr.hi);
+        sm = dl_fast_sum(csum, pr.hi);  // lossless addition
         csum = sm.hi;
-        frac1 += pr.lo;
-        frac2 += sm.lo;
+        frac1 += pr.lo;                 // lossy addition
+        frac2 += sm.lo;                 // lossy addition
     }
     h = sqrt(csum - 1.0 + (frac1 + frac2));
     pr = dl_mul(-h, h);
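
(Aside: dl_mul() and dl_fast_sum() are defined earlier in mathmodule.c and
are not part of this hunk. The standard constructions they correspond to,
Dekker's exact product via fma() and the fast two-sum, look roughly like the
sketch below; the exact bodies in mathmodule.c may differ.)

    #include <assert.h>
    #include <math.h>

    typedef struct { double hi; double lo; } DoubleLength;

    /* Exact product: hi + lo == x*y with no rounding error. */
    static DoubleLength dl_mul(double x, double y) {
        double z = x * y;
        double zz = fma(x, y, -z);      /* recovers the rounding error of x*y */
        return (DoubleLength) {z, zz};
    }

    /* Fast two-sum: exact when |a| >= |b|, so hi + lo == a + b. */
    static DoubleLength dl_fast_sum(double a, double b) {
        assert(fabs(a) >= fabs(b));     /* holds in the loop: csum >= 1.0 >= pr.hi */
        double x = a + b;
        double y = (a - x) + b;         /* recovers the rounding error of a+b */
        return (DoubleLength) {x, y};
    }
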
@@ -2548,7 +2541,8 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
     frac1 += pr.lo;
     frac2 += sm.lo;
     x = csum - 1.0 + (frac1 + frac2);
-    return (h + x / (2.0 * h)) / scale;
+    h += x / (2.0 * h);                 // differential correction
+    return h / scale;
 }
 
 #define NUM_STACK_ELEMS 16
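
(Aside: the rewritten tail makes the Newton step explicit. If s is the
exactly accumulated sum of scaled squares and h = sqrt(s) as computed above,
a first-order Taylor expansion of the square root around h*h gives

    sqrt(s) ~= h + (s - h*h) / (2*h)

Here x holds s - h*h, assembled exactly from dl_mul(-h, h) and the fractional
accumulators, so "h += x / (2.0 * h)" applies the correction and
"return h / scale" undoes the initial scaling.)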