@@ -40,7 +40,6 @@ abs_ptrdiff(char *a, char *b)
40
40
return (a > b ) ? (a - b ) : (b - a );
41
41
}
42
42
43
-
44
43
/*
45
44
* stride is equal to element size and input and destination are equal or
46
45
* don't overlap within one register. The check of the steps against
@@ -133,7 +132,7 @@ abs_ptrdiff(char *a, char *b)
133
132
*/
134
133
135
134
static void
136
- @ISA @_ @func @_FLOAT (npy_float * , npy_float * , const npy_int n );
135
+ @ISA @_ @func @_FLOAT (npy_float * , npy_float * , const npy_intp n );
137
136
138
137
/**end repeat1**/
139
138
#endif
@@ -1261,7 +1260,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
1261
1260
* #BYTES = 32, 64#
1262
1261
* #mask = __m256, __mmask16#
1263
1262
* #vsub = , _mask#
1264
- * #and_masks =_mm256_and_ps, _mm512_kand #
1263
+ * #or_masks =_mm256_or_ps, _mm512_kor #
1265
1264
* #fmadd = avx2_fmadd,_mm512_fmadd_ps#
1266
1265
* #mask_to_int = _mm256_movemask_ps, #
1267
1266
* #full_mask= 0xFF, 0xFFFF#
@@ -1287,7 +1286,7 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@
1287
1286
1288
1287
#if defined HAVE_ATTRIBUTE_TARGET_ @ISA @_WITH_INTRINSICS
1289
1288
static NPY_GCC_OPT_3 NPY_GCC_TARGET_ @ISA @ void
1290
- @ISA @_exp_FLOAT (npy_float * op , npy_float * ip , const npy_int array_size )
1289
+ @ISA @_exp_FLOAT (npy_float * op , npy_float * ip , const npy_intp array_size )
1291
1290
{
1292
1291
const npy_int num_lanes = @BYTES @/sizeof (npy_float );
1293
1292
npy_float xmax = 88.72283935546875f ;
@@ -1312,21 +1311,24 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
1312
1311
@vtype @ poly , num_poly , denom_poly , quadrant ;
1313
1312
@vtype @i exponent ;
1314
1313
1315
- @mask @ xmax_mask , xmin_mask ;
1314
+ @mask @ xmax_mask , xmin_mask , nan_mask , inf_mask ;
1316
1315
@mask @ load_mask = @isa @_get_full_load_mask ();
1317
- npy_int num_remaining_elements = array_size ;
1316
+ npy_intp num_remaining_elements = array_size ;
1317
+ npy_intp set_overflow = 0 ;
1318
1318
1319
1319
while (num_remaining_elements > 0 ) {
1320
1320
1321
1321
if (num_remaining_elements < num_lanes )
1322
1322
load_mask = @isa @_get_partial_load_mask (num_remaining_elements ,
1323
1323
num_lanes );
1324
1324
@vtype @ x = @isa @_masked_load (load_mask , ip );
1325
+
1325
1326
xmax_mask = _mm @vsize @_cmp_ps @vsub @(x , _mm @vsize @_set1_ps (xmax ), _CMP_GE_OQ );
1326
1327
xmin_mask = _mm @vsize @_cmp_ps @vsub @(x , _mm @vsize @_set1_ps (xmin ), _CMP_LE_OQ );
1327
-
1328
- x = @isa @_set_masked_lanes (x , zeros_f ,
1329
- @and_masks @(xmax_mask ,xmin_mask ));
1328
+ nan_mask = _mm @vsize @_cmp_ps @vsub @(x , x , _CMP_NEQ_UQ );
1329
+ inf_mask = _mm @vsize @_cmp_ps @vsub @(x , inf , _CMP_EQ_OQ );
1330
+ x = @isa @_set_masked_lanes (x , zeros_f , @or_masks @(
1331
+ @or_masks @(nan_mask , xmin_mask ), xmax_mask ));
1330
1332
1331
1333
quadrant = _mm @vsize @_mul_ps (x , log2e );
1332
1334
@@ -1335,8 +1337,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
1335
1337
quadrant = _mm @vsize @_sub_ps (quadrant , cvt_magic );
1336
1338
1337
1339
/* Cody-Waite's range reduction algorithm */
1338
- x = @isa @_range_reduction (x , quadrant ,
1339
- codyw_c1 , codyw_c2 , zeros_f );
1340
+ x = @isa @_range_reduction (x , quadrant , codyw_c1 , codyw_c2 , zeros_f );
1340
1341
1341
1342
num_poly = @fmadd @(exp_p5 , x , exp_p4 );
1342
1343
num_poly = @fmadd @(num_poly , x , exp_p3 );
@@ -1357,16 +1358,27 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
1357
1358
_mm @vsize @_add_epi32 (
1358
1359
_mm @vsize @_castps_si @vsize @(poly ), exponent ));
1359
1360
1360
- /* elem > xmax; return inf, elem < xmin; return 0.0f */
1361
+ /*
1362
+ * elem > xmax; return inf
1363
+ * elem < xmin; return 0.0f
1364
+ * elem = +/- nan, return nan
1365
+ */
1366
+ poly = @isa @_set_masked_lanes (poly , _mm @vsize @_set1_ps (NPY_NANF ), nan_mask );
1361
1367
poly = @isa @_set_masked_lanes (poly , inf , xmax_mask );
1362
1368
poly = @isa @_set_masked_lanes (poly , zeros_f , xmin_mask );
1363
1369
1364
1370
@masked_store @(op , @cvtps_epi32 @(load_mask ), poly );
1365
1371
1372
+ set_overflow += _mm_popcnt_u32 (
1373
+ @mask_to_int @(xmax_mask ) ^ @mask_to_int @(inf_mask ));
1374
+
1366
1375
ip += num_lanes ;
1367
1376
op += num_lanes ;
1368
1377
num_remaining_elements -= num_lanes ;
1369
1378
}
1379
+
1380
+ if (set_overflow )
1381
+ _mm_setcsr (_mm_getcsr () | (0x1 << 3 ));
1370
1382
}
1371
1383
1372
1384
/*
@@ -1384,7 +1396,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
1384
1396
*/
1385
1397
1386
1398
static NPY_GCC_OPT_3 NPY_GCC_TARGET_ @ISA @ void
1387
- @ISA @_log_FLOAT (npy_float * op , npy_float * ip , const npy_int array_size )
1399
+ @ISA @_log_FLOAT (npy_float * op , npy_float * ip , const npy_intp array_size )
1388
1400
{
1389
1401
const npy_int num_lanes = @BYTES @/sizeof (npy_float );
1390
1402
@@ -1410,7 +1422,7 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
1410
1422
1411
1423
@mask @ inf_nan_mask , sqrt2_mask , zero_mask , negx_mask ;
1412
1424
@mask @ load_mask = @isa @_get_full_load_mask ();
1413
- npy_int num_remaining_elements = array_size ;
1425
+ npy_intp num_remaining_elements = array_size ;
1414
1426
1415
1427
while (num_remaining_elements > 0 ) {
1416
1428
0 commit comments