From e235e8eae67f16b3a58817cfdff729693faf2665 Mon Sep 17 00:00:00 2001 From: Mauri de Souza Meneguzzo Date: Wed, 1 Nov 2023 11:46:07 -0300 Subject: [PATCH] x/crypto/chacha20poly1305: guard PSHUFP usage with GOAMD64_v2 The PSHUFP instruction is part of SSE which is only v2+ but it is being used without the GOAMD64_v2 guard. The ROL routines were copied from Go's internal/chacha20poly1305 package. Fixes golang/go#63871 --- chacha20poly1305/chacha20poly1305_amd64.s | 24 +++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/chacha20poly1305/chacha20poly1305_amd64.s b/chacha20poly1305/chacha20poly1305_amd64.s index 541d696b67..731d2ac6db 100644 --- a/chacha20poly1305/chacha20poly1305_amd64.s +++ b/chacha20poly1305/chacha20poly1305_amd64.s @@ -183,11 +183,31 @@ GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 + // Some macros + +// ROL rotates the uint32s in register R left by N bits, using temporary T. +#define ROL(N, R, T) \ + MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R + +// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. +#ifdef GOAMD64_v2 +#define ROL16(R, T) PSHUFB ·rol16<>(SB), R +#else +#define ROL16(R, T) ROL(16, R, T) +#endif + +// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. +#ifdef GOAMD64_v2 +#define ROL8(R, T) PSHUFB ·rol8<>(SB), R +#else +#define ROL8(R, T) ROL(8, R, T) +#endif + #define chachaQR(A, B, C, D, T) \ - PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \ + PADDD B, A; PXOR A, D; ROL16(D, T) \ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ - PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \ + PADDD B, A; PXOR A, D; ROL8(D, T) \ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B #define chachaQR_AVX2(A, B, C, D, T) \