From 1a57267a17c2fc17fb6e104846fabc3e363c326c Mon Sep 17 00:00:00 2001 From: Emile Date: Fri, 16 Aug 2024 19:50:26 +0200 Subject: initial commit --- vendor/golang.org/x/crypto/argon2/blamka_amd64.s | 243 +++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 vendor/golang.org/x/crypto/argon2/blamka_amd64.s (limited to 'vendor/golang.org/x/crypto/argon2/blamka_amd64.s') diff --git a/vendor/golang.org/x/crypto/argon2/blamka_amd64.s b/vendor/golang.org/x/crypto/argon2/blamka_amd64.s new file mode 100644 index 0000000..6713acc --- /dev/null +++ b/vendor/golang.org/x/crypto/argon2/blamka_amd64.s @@ -0,0 +1,243 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 && gc && !purego + +#include "textflag.h" + +DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 +DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 + +DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 +DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 + +#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ + MOVO v4, t1; \ + MOVO v5, v4; \ + MOVO t1, v5; \ + MOVO v6, t1; \ + PUNPCKLQDQ v6, t2; \ + PUNPCKHQDQ v7, v6; \ + PUNPCKHQDQ t2, v6; \ + PUNPCKLQDQ v7, t2; \ + MOVO t1, v7; \ + MOVO v2, t1; \ + PUNPCKHQDQ t2, v7; \ + PUNPCKLQDQ v3, t2; \ + PUNPCKHQDQ t2, v2; \ + PUNPCKLQDQ t1, t2; \ + PUNPCKHQDQ t2, v3 + +#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ + MOVO v4, t1; \ + MOVO v5, v4; \ + MOVO t1, v5; \ + MOVO v2, t1; \ + PUNPCKLQDQ v2, t2; \ + PUNPCKHQDQ v3, v2; \ + PUNPCKHQDQ t2, v2; \ + PUNPCKLQDQ v3, t2; \ + MOVO t1, v3; \ + MOVO v6, t1; \ + PUNPCKHQDQ t2, v3; \ + PUNPCKLQDQ v7, t2; \ + PUNPCKHQDQ t2, v6; \ + PUNPCKLQDQ t1, t2; \ + PUNPCKHQDQ t2, v7 + +#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \ + MOVO v0, t0; \ + PMULULQ v2, t0; \ + PADDQ v2, v0; \ + PADDQ t0, v0; \ + PADDQ t0, v0; \ + PXOR v0, v6; \ + PSHUFD $0xB1, v6, v6; \ + MOVO v4, t0; \ + PMULULQ v6, t0; \ + PADDQ v6, v4; \ + PADDQ t0, v4; \ + PADDQ t0, v4; \ + PXOR v4, v2; \ + PSHUFB c40, v2; \ + MOVO v0, t0; \ + PMULULQ v2, t0; \ + PADDQ v2, v0; \ + PADDQ t0, v0; \ + PADDQ t0, v0; \ + PXOR v0, v6; \ + PSHUFB c48, v6; \ + MOVO v4, t0; \ + PMULULQ v6, t0; \ + PADDQ v6, v4; \ + PADDQ t0, v4; \ + PADDQ t0, v4; \ + PXOR v4, v2; \ + MOVO v2, t0; \ + PADDQ v2, t0; \ + PSRLQ $63, v2; \ + PXOR t0, v2; \ + MOVO v1, t0; \ + PMULULQ v3, t0; \ + PADDQ v3, v1; \ + PADDQ t0, v1; \ + PADDQ t0, v1; \ + PXOR v1, v7; \ + PSHUFD $0xB1, v7, v7; \ + MOVO v5, t0; \ + PMULULQ v7, t0; \ + PADDQ v7, v5; \ + PADDQ t0, v5; \ + PADDQ t0, v5; \ + PXOR v5, v3; \ + PSHUFB c40, v3; \ + MOVO v1, t0; \ + PMULULQ v3, t0; \ + PADDQ v3, v1; \ + PADDQ t0, v1; \ + PADDQ t0, v1; \ + PXOR v1, v7; \ + PSHUFB c48, v7; \ + MOVO v5, t0; \ + PMULULQ v7, t0; \ + PADDQ v7, v5; \ + PADDQ t0, v5; \ + PADDQ t0, v5; \ + PXOR v5, v3; \ + MOVO v3, t0; \ + PADDQ v3, t0; \ + PSRLQ $63, v3; \ + PXOR t0, v3 + +#define LOAD_MSG_0(block, off) \ + MOVOU 8*(off+0)(block), X0; \ + MOVOU 8*(off+2)(block), X1; \ + MOVOU 8*(off+4)(block), X2; \ + MOVOU 8*(off+6)(block), X3; \ + MOVOU 8*(off+8)(block), X4; \ + MOVOU 8*(off+10)(block), X5; \ + MOVOU 8*(off+12)(block), X6; \ + MOVOU 8*(off+14)(block), X7 + +#define STORE_MSG_0(block, off) \ + MOVOU X0, 8*(off+0)(block); \ + MOVOU X1, 8*(off+2)(block); \ + MOVOU X2, 8*(off+4)(block); \ + MOVOU X3, 8*(off+6)(block); \ + MOVOU X4, 8*(off+8)(block); \ + MOVOU X5, 8*(off+10)(block); \ + MOVOU X6, 8*(off+12)(block); \ + MOVOU X7, 8*(off+14)(block) + +#define LOAD_MSG_1(block, off) \ + MOVOU 8*off+0*8(block), X0; \ + MOVOU 8*off+16*8(block), X1; \ + MOVOU 8*off+32*8(block), X2; \ + MOVOU 8*off+48*8(block), X3; \ + MOVOU 8*off+64*8(block), X4; \ + MOVOU 8*off+80*8(block), X5; \ + MOVOU 8*off+96*8(block), X6; \ + MOVOU 8*off+112*8(block), X7 + +#define STORE_MSG_1(block, off) \ + MOVOU X0, 8*off+0*8(block); \ + MOVOU X1, 8*off+16*8(block); \ + MOVOU X2, 8*off+32*8(block); \ + MOVOU X3, 8*off+48*8(block); \ + MOVOU X4, 8*off+64*8(block); \ + MOVOU X5, 8*off+80*8(block); \ + MOVOU X6, 8*off+96*8(block); \ + MOVOU X7, 8*off+112*8(block) + +#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \ + LOAD_MSG_0(block, off); \ + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ + SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ + STORE_MSG_0(block, off) + +#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \ + LOAD_MSG_1(block, off); \ + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ + SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ + HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ + SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ + STORE_MSG_1(block, off) + +// func blamkaSSE4(b *block) +TEXT ·blamkaSSE4(SB), 4, $0-8 + MOVQ b+0(FP), AX + + MOVOU ·c40<>(SB), X10 + MOVOU ·c48<>(SB), X11 + + BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11) + BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11) + + BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11) + BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11) + RET + +// func mixBlocksSSE2(out, a, b, c *block) +TEXT ·mixBlocksSSE2(SB), 4, $0-32 + MOVQ out+0(FP), DX + MOVQ a+8(FP), AX + MOVQ b+16(FP), BX + MOVQ c+24(FP), CX + MOVQ $128, DI + +loop: + MOVOU 0(AX), X0 + MOVOU 0(BX), X1 + MOVOU 0(CX), X2 + PXOR X1, X0 + PXOR X2, X0 + MOVOU X0, 0(DX) + ADDQ $16, AX + ADDQ $16, BX + ADDQ $16, CX + ADDQ $16, DX + SUBQ $2, DI + JA loop + RET + +// func xorBlocksSSE2(out, a, b, c *block) +TEXT ·xorBlocksSSE2(SB), 4, $0-32 + MOVQ out+0(FP), DX + MOVQ a+8(FP), AX + MOVQ b+16(FP), BX + MOVQ c+24(FP), CX + MOVQ $128, DI + +loop: + MOVOU 0(AX), X0 + MOVOU 0(BX), X1 + MOVOU 0(CX), X2 + MOVOU 0(DX), X3 + PXOR X1, X0 + PXOR X2, X0 + PXOR X3, X0 + MOVOU X0, 0(DX) + ADDQ $16, AX + ADDQ $16, BX + ADDQ $16, CX + ADDQ $16, DX + SUBQ $2, DI + JA loop + RET -- cgit 1.4.1