From 5b3f9fc70e2abba020108c2575a5d32e5e638ed9 Mon Sep 17 00:00:00 2001
From: Jan Dupej
Date: Thu, 13 Apr 2023 14:33:03 +0200
Subject: [PATCH 1/3] Adding Vector128.Narrow as intrinsic.

---
 src/mono/mono/arch/arm64/arm64-codegen.h | 16 ++++-----------
 src/mono/mono/mini/cpu-arm64.mdesc       |  4 ++++
 src/mono/mono/mini/mini-arm64.c          | 26 +++++++++++++++++++++++-
 src/mono/mono/mini/simd-intrinsics.c     | 18 +++++++++++-----
 4 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h
index 20796854c42ff0..3c2ac5b53ab866 100644
--- a/src/mono/mono/arch/arm64/arm64-codegen.h
+++ b/src/mono/mono/arch/arm64/arm64-codegen.h
@@ -1208,6 +1208,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 // type - data type of vector elements, one of {TYPE_I8, TYPE_I16, TYPE_I32, TYPE_I64}
 #define arm_neon_abs(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b0, (type), 0b01011, (rd), (rn))
 #define arm_neon_neg(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, (type), 0b01011, (rd), (rn))
+#define arm_neon_xtn(p, type, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, (type), 0b10010, (rd), (rn))
+#define arm_neon_xtn2(p, type, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, (type), 0b10010, (rd), (rn))
 
 // Parametrized variants of the float opcodes
 // width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL}
@@ -1215,6 +1217,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_fabs(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b0, 0b10 | (type), 0b01111, (rd), (rn))
 #define arm_neon_fneg(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b01111, (rd), (rn))
 #define arm_neon_fsqrt(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11111, (rd), (rn))
+#define arm_neon_fcvtn(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10110, (rd), (rn))
+#define arm_neon_fcvtn2(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10110, (rd), (rn))
 
 // Parametrized variants of the bitwise opcodes
 // width - determines if full register or its lower half is used, one of {VREG_LOW, VREG_FULL}
@@ -1304,13 +1308,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_abs_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b01011, (rd), (rn))
 #define arm_neon_abs_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_8, 0b01011, (rd), (rn))
 
-#define arm_neon_xtn_8b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10010, (rd), (rn))
-#define arm_neon_xtn2_8b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10010, (rd), (rn))
-#define arm_neon_xtn_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10010, (rd), (rn))
-#define arm_neon_xtn2_8h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10010, (rd), (rn))
-#define arm_neon_xtn_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10010, (rd), (rn))
-#define arm_neon_xtn2_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10010, (rd), (rn))
-
 #define arm_neon_sqxtn_8b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10100, (rd), (rn))
 #define arm_neon_sqxtn2_8b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10100, (rd), (rn))
 #define arm_neon_sqxtn_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10100, (rd), (rn))
@@ -1318,11 +1315,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_sqxtn_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10100, (rd), (rn))
 #define arm_neon_sqxtn2_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10100, (rd), (rn))
 
-#define arm_neon_fcvtn_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10110, (rd), (rn))
-#define arm_neon_fcvtn2_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10110, (rd), (rn))
-#define arm_neon_fcvtn_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10110, (rd), (rn))
-#define arm_neon_fcvtn2_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10110, (rd), (rn))
-
 #define arm_neon_fcvtl_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10111, (rd), (rn))
 #define arm_neon_fcvtl2_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10111, (rd), (rn))
 #define arm_neon_fcvtl_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn))
diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc
index 7176dd6309186a..ababa740581e13 100644
--- a/src/mono/mono/mini/cpu-arm64.mdesc
+++ b/src/mono/mono/mini/cpu-arm64.mdesc
@@ -533,6 +533,10 @@ create_scalar_unsafe_int: dest:x src1:i len:4
 create_scalar_unsafe_float: dest:x src1:f len:4
 arm64_bic: dest:x src1:x src2:x len:4
 bitwise_select: dest:x src1:x src2:x src3:x len:12
+arm64_xtn: dest:x src1:x len:4
+arm64_xtn2: dest:x src1:x src2:x len:4 clob:1
+arm64_fcvtn: dest:x src1:x len:4
+arm64_fcvtn2: dest:x src1:x src2:x len:4 clob:1
 
 generic_class_init: src1:a len:44 clob:c
 gc_safe_point: src1:i len:12 clob:c
diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c
index a97237a9f9fdb2..770cc12cb3710d 100644
--- a/src/mono/mono/mini/mini-arm64.c
+++ b/src/mono/mono/mini/mini-arm64.c
@@ -3774,7 +3774,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 		code = emit_ldrfpq (code, dreg, sreg1, ins->inst_offset);
 		break;
 	case OP_XMOVE:
-		arm_neon_mov (code, dreg, sreg1);
+		if (dreg != sreg1)
+			arm_neon_mov (code, dreg, sreg1);
 		break;
 	case OP_XCONST: {
 		if (cfg->compile_aot && cfg->code_exec_only) {
@@ -3848,6 +3849,29 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
 		arm_neon_ins_e(code, t, dreg, sreg1, ins->inst_c0, 0);
 		break;
 	}
+	case OP_ARM64_XTN:
+		// The '-1' here and in XTN2 is to account for the arm_neon_xtn macro defining
+		// its type as the type of the destination. Here inst_c1 is the type of the
+		// source data. Since XTN(2) steps down the type by one, e.g. I4 to I2, we
+		// subtract one.
+		arm_neon_xtn (code, get_type_size_macro (ins->inst_c1) - 1, dreg, sreg1);
+		break;
+
+	case OP_ARM64_XTN2:
+		g_assert (dreg == sreg1);
+		arm_neon_xtn2 (code, get_type_size_macro (ins->inst_c1) - 1, dreg, sreg2);
+		break;
+
+	case OP_ARM64_FCVTN:
+		// Only double->float is supported here, while arm64 can also do float->half.
+		arm_neon_fcvtn (code, dreg, sreg1);
+		break;
+
+	case OP_ARM64_FCVTN2:
+		g_assert (dreg == sreg1);
+		arm_neon_fcvtn2 (code, dreg, sreg2);
+		break;
+
 	case OP_ARM64_XADDV: {
 		switch (ins->inst_c0) {
 		case INTRINS_AARCH64_ADV_SIMD_FADDV:
diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c
index cd4d498f0b740e..b1cfaba1a7d1f7 100644
--- a/src/mono/mono/mini/simd-intrinsics.c
+++ b/src/mono/mono/mini/simd-intrinsics.c
@@ -1238,7 +1238,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 	case SN_GetElement:
 	case SN_GetLower:
 	case SN_GetUpper:
-	case SN_Narrow:
 	case SN_Shuffle:
 	case SN_ToVector128:
 	case SN_ToVector128Unsafe:
@@ -1650,8 +1649,11 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 		if (size == 16) {
 			switch (arg0_type) {
 			case MONO_TYPE_R8: {
-				MonoInst *ins = emit_simd_ins (cfg, arg_class, OP_ARM64_FCVTN, args [0]->dreg, -1);
-				return emit_simd_ins (cfg, arg_class, OP_ARM64_FCVTN2, ins->dreg, args [1]->dreg);
+				MonoInst* ins = emit_simd_ins (cfg, arg_class, OP_ARM64_FCVTN, args [0]->dreg, -1);
+				ins->inst_c1 = arg0_type;
+				MonoInst* ret = emit_simd_ins (cfg, arg_class, OP_ARM64_FCVTN2, ins->dreg, args [1]->dreg);
+				ret->inst_c1 = arg0_type;
+				return ret;
 			}
 			case MONO_TYPE_I2:
 			case MONO_TYPE_I4:
@@ -1659,13 +1661,19 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 			case MONO_TYPE_U2:
 			case MONO_TYPE_U4:
 			case MONO_TYPE_U8: {
-				MonoInst *ins = emit_simd_ins (cfg, arg_class, OP_ARM64_XTN, args [0]->dreg, -1);
-				return emit_simd_ins (cfg, arg_class, OP_ARM64_XTN2, ins->dreg, args [1]->dreg);
+				MonoInst* ins = emit_simd_ins (cfg, arg_class, OP_ARM64_XTN, args [0]->dreg, -1);
+				ins->inst_c1 = arg0_type;
+				MonoInst* ret = emit_simd_ins (cfg, arg_class, OP_ARM64_XTN2, ins->dreg, args [1]->dreg);
+				ret->inst_c1 = arg0_type;
+				return ret;
 			}
 			default:
 				return NULL;
 			}
 		} else {
+			if (!COMPILE_LLVM (cfg))
+				return NULL;
+
 			switch (arg0_type) {
 			case MONO_TYPE_R8: {
 				//Widen arg0
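Note on the narrowing lowering in the patch above: OP_ARM64_XTN writes the narrowed first operand into the low half of the destination, and OP_ARM64_XTN2 merges the narrowed second operand into the high half of that same register, which is why arm64_xtn2 is declared clob:1 in cpu-arm64.mdesc and the emitter asserts dreg == sreg1. A minimal scalar sketch of the semantics under those assumptions (illustration only, not code from the patch; the helper name is hypothetical):

#include <stdint.h>

/* Scalar model of Vector128.Narrow(Vector128<ulong> lo, Vector128<ulong> hi):
 * xtn truncates each 64-bit lane of 'lo' into result lanes 0-1, and xtn2
 * truncates 'hi' into lanes 2-3 of the register already holding the xtn
 * result (hence the dreg == sreg1 requirement for OP_ARM64_XTN2). */
static void
narrow_u8_to_u4 (const uint64_t lo [2], const uint64_t hi [2], uint32_t res [4])
{
	res [0] = (uint32_t) lo [0];
	res [1] = (uint32_t) lo [1];
	res [2] = (uint32_t) hi [0];
	res [3] = (uint32_t) hi [1];
}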
From 9ce3976ba2e4f9458cb3b63e5fde129430a7b456 Mon Sep 17 00:00:00 2001
From: Jan Dupej
Date: Thu, 13 Apr 2023 15:46:55 +0200
Subject: [PATCH 2/3] Adding Vector128.WidenLower,WidenUpper as intrinsics on arm64.

---
 src/mono/mono/arch/arm64/arm64-codegen.h | 38 ++++++++++--------------
 src/mono/mono/mini/cpu-arm64.mdesc       |  1 +
 src/mono/mono/mini/mini-arm64.c          |  2 ++
 src/mono/mono/mini/mini-ops.h            |  1 +
 src/mono/mono/mini/simd-arm64.h          |  6 ++++
 src/mono/mono/mini/simd-intrinsics.c     | 34 ++++++++++++++-------
 6 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h
index 3c2ac5b53ab866..1f0166ccee484f 100644
--- a/src/mono/mono/arch/arm64/arm64-codegen.h
+++ b/src/mono/mono/arch/arm64/arm64-codegen.h
@@ -1219,6 +1219,9 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_fsqrt(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11111, (rd), (rn))
 #define arm_neon_fcvtn(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10110, (rd), (rn))
 #define arm_neon_fcvtn2(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10110, (rd), (rn))
+#define arm_neon_fcvtl(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn))
+#define arm_neon_fcvtl2(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10111, (rd), (rn))
+
 
 // Parametrized variants of the bitwise opcodes
 // width - determines if full register or its lower half is used, one of {VREG_LOW, VREG_FULL}
@@ -1315,11 +1318,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_sqxtn_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10100, (rd), (rn))
 #define arm_neon_sqxtn2_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10100, (rd), (rn))
 
-#define arm_neon_fcvtl_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10111, (rd), (rn))
-#define arm_neon_fcvtl2_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10111, (rd), (rn))
-#define arm_neon_fcvtl_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn))
-#define arm_neon_fcvtl2_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10111, (rd), (rn))
-
 #define arm_neon_frintn_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11000, (rd), (rn))
 #define arm_neon_frintn_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11000, (rd), (rn))
 #define arm_neon_frintn_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11000, (rd), (rn))
@@ -2275,18 +2273,26 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_shimm_opcode(p, q, u, immh, immb, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001111000000000000010000000000 | (u) << 29 | (immh) << 19 | (immb) << 16 | (opcode) << 11, (rd), (rn))
 #define arm_neon_shimm_shr_immh_immb(size, shift) (((shift) - (16 << (size))) & 0b01111111)
 #define arm_neon_shimm_shr_opcode(p, q, u, size, opcode, rd, rn, shift) do { \
-	int32_t ___temp_emit0 = arm_neon_shimm_shr_immh_immb ((size), (shift)); \
-	arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)) \
+	int32_t __temp_emit0 = arm_neon_shimm_shr_immh_immb ((size), (shift)); \
+	arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)); \
 } while (0)
 
 #define arm_neon_shimm_shl_immh_immb(size, shift) (((shift) + (8 << (size))) & 0b01111111)
 #define arm_neon_shimm_shl_opcode(p, q, u, size, opcode, rd, rn, shift) do { \
-	int32_t ___temp_emit0 = arm_neon_shimm_shl_immh_immb ((size), (shift)); \
-	arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)) \
+	int32_t __temp_emit0 = arm_neon_shimm_shl_immh_immb ((size), (shift)); \
+	arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)); \
 } while (0)
 
 #define arm_neon_sli(p, width, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), (width), 0b1, (type), 0b01010, (rd), (rn), (shift))
 #define arm_neon_shrn(p, type, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, (type), 0b10000, (rd), (rn), (shift))
+#define arm_neon_sshll(p, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, (type), 0b10100, (rd), (rn), (shift))
+#define arm_neon_sshll2(p, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b0, (type), 0b10100, (rd), (rn), (shift))
+#define arm_neon_sxtl(p, type, rd, rn) arm_neon_sshll ((p), (type), (rd), (rn), 0)
+#define arm_neon_sxtl2(p, type, rd, rn) arm_neon_sshll2 ((p), (type), (rd), (rn), 0)
+#define arm_neon_ushll(p, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, (type), 0b10100, (rd), (rn), (shift))
+#define arm_neon_ushll2(p, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, (type), 0b10100, (rd), (rn), (shift))
+#define arm_neon_uxtl(p, type, rd, rn) arm_neon_ushll ((p), (type), (rd), (rn), 0)
+#define arm_neon_uxtl2(p, type, rd, rn) arm_neon_ushll2 ((p), (type), (rd), (rn), 0)
 
 #define arm_neon_sshr_8b(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b00000, (rd), (rn), (shift))
 #define arm_neon_sshr_16b(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b00000, (rd), (rn), (shift))
@@ -2364,13 +2370,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_sqrshrn_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10011, (rd), (rn), (shift))
 #define arm_neon_sqrshrn2_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10011, (rd), (rn), (shift))
 
-#define arm_neon_shll_i_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll2_i_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll_i_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll2_i_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll_i_2s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll2_i_2s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10100, (rd), (rn), (shift))
-
 #define arm_neon_scvtf_i_4h(p, rd, rn, fbits) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b11100, (rd), (rn), (fbits))
 #define arm_neon_scvtf_i_8h(p, rd, rn, fbits) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11100, (rd), (rn), (fbits))
 #define arm_neon_scvtf_i_2s(p, rd, rn, fbits) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b11100, (rd), (rn), (fbits))
@@ -2467,13 +2466,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_uqrshrn_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b1, SIZE_4, 0b10011, (rd), (rn), (shift))
 #define arm_neon_uqrshrn2_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b1, SIZE_4, 0b10011, (rd), (rn), (shift))
 
-#define arm_neon_ushll_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll2_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_2, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll2_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll_2s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_4, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll2_2s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_4, 0b10100, (rd), (rn), (shift))
-
 #define arm_neon_ucvtf_i_4h(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b1, SIZE_2, 0b11100, (rd), (rn), (shift))
 #define arm_neon_ucvtf_i_8h(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11100, (rd), (rn), (shift))
 #define arm_neon_ucvtf_i_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b1, SIZE_4, 0b11100, (rd), (rn), (shift))
diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc
index ababa740581e13..ced3ba7b1c0766 100644
--- a/src/mono/mono/mini/cpu-arm64.mdesc
+++ b/src/mono/mono/mini/cpu-arm64.mdesc
@@ -537,6 +537,7 @@ arm64_xtn: dest:x src1:x len:4
 arm64_xtn2: dest:x src1:x src2:x len:4 clob:1
 arm64_fcvtn: dest:x src1:x len:4
 arm64_fcvtn2: dest:x src1:x src2:x len:4 clob:1
+xunop: dest:x src1:x len:4
 
 generic_class_init: src1:a len:44 clob:c
 gc_safe_point: src1:i len:12 clob:c
diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c
index 770cc12cb3710d..3e5c68f8c822b4 100644
--- a/src/mono/mono/mini/mini-arm64.c
+++ b/src/mono/mono/mini/mini-arm64.c
@@ -35,6 +35,8 @@
 #define EXPAND(x) x
 #define PARENTHESIZE(...) (__VA_ARGS__)
 #define EXPAND_FUN(m, ...) EXPAND(m PARENTHESIZE(__VA_ARGS__))
+#define OPFMT_DS dreg, sreg1
+#define OPFMT_TDS _t, dreg, sreg1
 #define OPFMT_WDSS _w, dreg, sreg1, sreg2
 #define OPFMT_WTDS _w, _t, dreg, sreg1
 #define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2
diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h
index 4f45bc347d224b..574b30acf5b7ab 100644
--- a/src/mono/mono/mini/mini-ops.h
+++ b/src/mono/mono/mini/mini-ops.h
@@ -1498,6 +1498,7 @@ MINI_OP(OP_XEXTRACT, "xextract", IREG, XREG, NONE)
 /*
  * Generic SIMD operations, the rest of the JIT doesn't care about the exact operation.
 */
+MINI_OP(OP_XUNOP, "xunop", XREG, XREG, NONE)
 MINI_OP(OP_XBINOP, "xbinop", XREG, XREG, XREG)
 MINI_OP(OP_XBINOP_FORCEINT, "xbinop_forceint", XREG, XREG, XREG)
 MINI_OP(OP_XBINOP_SCALAR, "xbinop_scalar", XREG, XREG, XREG)
diff --git a/src/mono/mono/mini/simd-arm64.h b/src/mono/mono/mini/simd-arm64.h
index 1f10471b708e65..68ee77aeb47fa3 100644
--- a/src/mono/mono/mini/simd-arm64.h
+++ b/src/mono/mono/mini/simd-arm64.h
@@ -49,6 +49,12 @@ SIMD_OP (128, OP_XCOMPARE_FP, CMP_GE, WTDSS, _UNDEF,
 SIMD_OP (128, OP_XCOMPARE_FP, CMP_LT, WTDSS_REV, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmgt, arm_neon_fcmgt)
 SIMD_OP (128, OP_XCOMPARE_FP, CMP_LE, WTDSS_REV, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcmge, arm_neon_fcmge)
+SIMD_OP (128, OP_XUNOP, OP_SIMD_FCVTL, DS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcvtl, _UNDEF)
+SIMD_OP (128, OP_XUNOP, OP_SIMD_FCVTL2, DS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fcvtl2, _UNDEF)
+SIMD_OP (128, OP_XUNOP, OP_ARM64_SXTL, TDS, arm_neon_sxtl, arm_neon_sxtl, arm_neon_sxtl, _UNDEF, _UNDEF, _UNDEF)
+SIMD_OP (128, OP_XUNOP, OP_ARM64_SXTL2, TDS, arm_neon_sxtl2, arm_neon_sxtl2, arm_neon_sxtl2, _UNDEF, _UNDEF, _UNDEF)
+SIMD_OP (128, OP_XUNOP, OP_ARM64_UXTL, TDS, arm_neon_uxtl, arm_neon_uxtl, arm_neon_uxtl, _UNDEF, _UNDEF, _UNDEF)
+SIMD_OP (128, OP_XUNOP, OP_ARM64_UXTL2, TDS, arm_neon_uxtl2, arm_neon_uxtl2, arm_neon_uxtl2, _UNDEF, _UNDEF, _UNDEF)
 
 SIMD_OP (128, OP_XBINOP, OP_IADD, WTDSS, arm_neon_add, arm_neon_add, arm_neon_add, arm_neon_add, _UNDEF, _UNDEF)
 SIMD_OP (128, OP_XBINOP, OP_FADD, WTDSS, _UNDEF, _UNDEF, _UNDEF, _UNDEF, arm_neon_fadd, arm_neon_fadd)
 SIMD_OP (128, OP_XBINOP, OP_ISUB, WTDSS, arm_neon_sub, arm_neon_sub, arm_neon_sub, arm_neon_sub, _UNDEF, _UNDEF)
diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c
index b1cfaba1a7d1f7..513f6aef804af6 100644
--- a/src/mono/mono/mini/simd-intrinsics.c
+++ b/src/mono/mono/mini/simd-intrinsics.c
@@ -1241,8 +1241,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 	case SN_Shuffle:
 	case SN_ToVector128:
 	case SN_ToVector128Unsafe:
-	case SN_WidenLower:
-	case SN_WidenUpper:
 	case SN_WithElement:
 		return NULL;
 	default:
@@ -1851,15 +1849,31 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 		if (!is_element_type_primitive (fsig->params [0]))
 			return NULL;
 
-		int op = id == SN_WidenLower ? OP_XLOWER : OP_XUPPER;
-		MonoInst *lower_or_upper_half = emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args);
-		if (type_enum_is_float (arg0_type)) {
-			return emit_simd_ins (cfg, klass, OP_SIMD_FCVTL, lower_or_upper_half->dreg, -1);
+		if (COMPILE_LLVM (cfg)) {
+			int op = id == SN_WidenLower ? OP_XLOWER : OP_XUPPER;
+			MonoInst *lower_or_upper_half = emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args);
+			if (type_enum_is_float (arg0_type)) {
+				return emit_simd_ins (cfg, klass, OP_SIMD_FCVTL, lower_or_upper_half->dreg, -1);
+			} else {
+				int zero = alloc_ireg (cfg);
+				MONO_EMIT_NEW_ICONST (cfg, zero, 0);
+				op = type_enum_is_unsigned (arg0_type) ? OP_SIMD_USHLL : OP_SIMD_SSHLL;
+				return emit_simd_ins (cfg, klass, op, lower_or_upper_half->dreg, zero);
+			}
 		} else {
-			int zero = alloc_ireg (cfg);
-			MONO_EMIT_NEW_ICONST (cfg, zero, 0);
-			op = type_enum_is_unsigned (arg0_type) ? OP_SIMD_USHLL : OP_SIMD_SSHLL;
-			return emit_simd_ins (cfg, klass, op, lower_or_upper_half->dreg, zero);
+			int op = 0;
+			gboolean is_upper = (id == SN_WidenUpper);
+			if (type_enum_is_float (arg0_type))
+				op = is_upper ? OP_SIMD_FCVTL2 : OP_SIMD_FCVTL;
+			else if (type_enum_is_unsigned (arg0_type))
+				op = is_upper ? OP_ARM64_UXTL2 : OP_ARM64_UXTL;
+			else
+				op = is_upper ? OP_ARM64_SXTL2 : OP_ARM64_SXTL;
+
+			MonoInst* ins = emit_simd_ins (cfg, klass, OP_XUNOP, args [0]->dreg, -1);
+			ins->inst_c0 = op;
+			ins->inst_c1 = arg0_type;
+			return ins;
 		}
 #else
 		return NULL;

From 42dd837a0049becfd147de68ff2fd1814b39caa3 Mon Sep 17 00:00:00 2001
From: Jan Dupej
Date: Fri, 14 Apr 2023 15:54:37 +0200
Subject: [PATCH 3/3] Fixed WASM builds.

---
 src/mono/mono/mini/simd-intrinsics.c | 36 ++++++++++++++--------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c
index 513f6aef804af6..4acc70fce368c6 100644
--- a/src/mono/mono/mini/simd-intrinsics.c
+++ b/src/mono/mono/mini/simd-intrinsics.c
@@ -1845,11 +1845,26 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 	}
 	case SN_WidenLower:
 	case SN_WidenUpper: {
-#if defined(TARGET_ARM64) || defined(TARGET_WASM)
 		if (!is_element_type_primitive (fsig->params [0]))
 			return NULL;
-
-		if (COMPILE_LLVM (cfg)) {
+#if defined(TARGET_ARM64)
+		if (!COMPILE_LLVM (cfg)) {
+			int subop = 0;
+			gboolean is_upper = (id == SN_WidenUpper);
+			if (type_enum_is_float (arg0_type))
+				subop = is_upper ? OP_SIMD_FCVTL2 : OP_SIMD_FCVTL;
+			else if (type_enum_is_unsigned (arg0_type))
+				subop = is_upper ? OP_ARM64_UXTL2 : OP_ARM64_UXTL;
+			else
+				subop = is_upper ? OP_ARM64_SXTL2 : OP_ARM64_SXTL;
+
+			MonoInst* ins = emit_simd_ins (cfg, klass, OP_XUNOP, args [0]->dreg, -1);
+			ins->inst_c0 = subop;
+			ins->inst_c1 = arg0_type;
+			return ins;
+		}
+#endif
+#if defined(TARGET_ARM64) || defined(TARGET_WASM)
 		int op = id == SN_WidenLower ? OP_XLOWER : OP_XUPPER;
 		MonoInst *lower_or_upper_half = emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args);
 		if (type_enum_is_float (arg0_type)) {
@@ -1860,21 +1875,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 			op = type_enum_is_unsigned (arg0_type) ? OP_SIMD_USHLL : OP_SIMD_SSHLL;
 			return emit_simd_ins (cfg, klass, op, lower_or_upper_half->dreg, zero);
 		}
-		} else {
-			int op = 0;
-			gboolean is_upper = (id == SN_WidenUpper);
-			if (type_enum_is_float (arg0_type))
-				op = is_upper ? OP_SIMD_FCVTL2 : OP_SIMD_FCVTL;
-			else if (type_enum_is_unsigned (arg0_type))
-				op = is_upper ? OP_ARM64_UXTL2 : OP_ARM64_UXTL;
-			else
-				op = is_upper ? OP_ARM64_SXTL2 : OP_ARM64_SXTL;
-
-			MonoInst* ins = emit_simd_ins (cfg, klass, OP_XUNOP, args [0]->dreg, -1);
-			ins->inst_c0 = op;
-			ins->inst_c1 = arg0_type;
-			return ins;
-		}
 #else
 		return NULL;
 #endif
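For reference, the non-LLVM lowering introduced in patches 2 and 3 maps WidenLower/WidenUpper onto a single OP_XUNOP whose inst_c0 selects SXTL/UXTL/FCVTL for the lower half or SXTL2/UXTL2/FCVTL2 for the upper half; sxtl/uxtl are encoded as sshll/ushll with a shift of 0, which is exactly what the new arm_neon_sxtl(2)/arm_neon_uxtl(2) macro aliases expand to. A minimal scalar sketch of the semantics for the signed integer case (illustration only, not code from the patch; the helper name is hypothetical):

#include <stdint.h>

/* Scalar model of Vector128.WidenLower/WidenUpper on Vector128<int>:
 * sxtl sign-extends lanes 0-1 (lower half), sxtl2 sign-extends lanes 2-3
 * (upper half); uxtl/uxtl2 zero-extend instead, and fcvtl/fcvtl2 convert
 * float lanes to double. */
static void
widen_i4_to_i8 (const int32_t src [4], int64_t lower [2], int64_t upper [2])
{
	lower [0] = (int64_t) src [0]; /* sxtl  */
	lower [1] = (int64_t) src [1];
	upper [0] = (int64_t) src [2]; /* sxtl2 */
	upper [1] = (int64_t) src [3];
}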