Update reedsolomon/highwayhash to start using ppc64le support (#7003)
Thanks to @fwessels for the upstream work on reedsolomon and highwayhash which has resulted in 10x performance improvement on ppc64 architecture.master
parent
bc67410548
commit
def04f01cf
@ -0,0 +1,67 @@ |
||||
//+build !noasm
|
||||
//+build !appengine
|
||||
//+build !gccgo
|
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||||
// Copyright 2018, Minio, Inc.
|
||||
|
||||
package reedsolomon |
||||
|
||||
//go:noescape
|
||||
func galMulPpc(low, high, in, out []byte) |
||||
|
||||
//go:noescape
|
||||
func galMulPpcXor(low, high, in, out []byte) |
||||
|
||||
// This is what the assembler routines do in blocks of 16 bytes:
|
||||
/* |
||||
func galMulPpc(low, high, in, out []byte) { |
||||
for n, input := range in { |
||||
l := input & 0xf |
||||
h := input >> 4 |
||||
out[n] = low[l] ^ high[h] |
||||
} |
||||
} |
||||
func galMulPpcXor(low, high, in, out []byte) { |
||||
for n, input := range in { |
||||
l := input & 0xf |
||||
h := input >> 4 |
||||
out[n] ^= low[l] ^ high[h] |
||||
} |
||||
} |
||||
*/ |
||||
|
||||
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) { |
||||
done := (len(in) >> 4) << 4 |
||||
if done > 0 { |
||||
galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) |
||||
} |
||||
remain := len(in) - done |
||||
if remain > 0 { |
||||
mt := mulTable[c] |
||||
for i := done; i < len(in); i++ { |
||||
out[i] = mt[in[i]] |
||||
} |
||||
} |
||||
} |
||||
|
||||
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) { |
||||
done := (len(in) >> 4) << 4 |
||||
if done > 0 { |
||||
galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out) |
||||
} |
||||
remain := len(in) - done |
||||
if remain > 0 { |
||||
mt := mulTable[c] |
||||
for i := done; i < len(in); i++ { |
||||
out[i] ^= mt[in[i]] |
||||
} |
||||
} |
||||
} |
||||
|
||||
// slice galois add
|
||||
func sliceXor(in, out []byte, sse2 bool) { |
||||
for n, input := range in { |
||||
out[n] ^= input |
||||
} |
||||
} |
@ -0,0 +1,126 @@ |
||||
//+build !noasm !appengine !gccgo |
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details. |
||||
// Copyright 2018, Minio, Inc. |
||||
|
||||
#include "textflag.h" |
||||
|
||||
#define LOW R3 |
||||
#define HIGH R4 |
||||
#define IN R5 |
||||
#define LEN R6 |
||||
#define OUT R7 |
||||
#define CONSTANTS R8 |
||||
#define OFFSET R9 |
||||
#define OFFSET1 R10 |
||||
#define OFFSET2 R11 |
||||
|
||||
#define X6 VS34 |
||||
#define X6_ V2 |
||||
#define X7 VS35 |
||||
#define X7_ V3 |
||||
#define MSG VS36 |
||||
#define MSG_ V4 |
||||
#define MSG_HI VS37 |
||||
#define MSG_HI_ V5 |
||||
#define RESULT VS38 |
||||
#define RESULT_ V6 |
||||
#define ROTATE VS39 |
||||
#define ROTATE_ V7 |
||||
#define MASK VS40 |
||||
#define MASK_ V8 |
||||
#define FLIP VS41 |
||||
#define FLIP_ V9 |
||||
|
||||
|
||||
// func galMulPpc(low, high, in, out []byte) |
||||
TEXT ·galMulPpc(SB), NOFRAME|NOSPLIT, $0-96 |
||||
MOVD low+0(FP), LOW |
||||
MOVD high+24(FP), HIGH |
||||
MOVD in+48(FP), IN |
||||
MOVD in_len+56(FP), LEN |
||||
MOVD out+72(FP), OUT |
||||
|
||||
MOVD $16, OFFSET1 |
||||
MOVD $32, OFFSET2 |
||||
|
||||
MOVD $·constants(SB), CONSTANTS |
||||
LXVD2X (CONSTANTS)(R0), ROTATE |
||||
LXVD2X (CONSTANTS)(OFFSET1), MASK |
||||
LXVD2X (CONSTANTS)(OFFSET2), FLIP |
||||
|
||||
LXVD2X (LOW)(R0), X6 |
||||
LXVD2X (HIGH)(R0), X7 |
||||
VPERM X6_, V31, FLIP_, X6_ |
||||
VPERM X7_, V31, FLIP_, X7_ |
||||
|
||||
MOVD $0, OFFSET |
||||
|
||||
loop: |
||||
LXVD2X (IN)(OFFSET), MSG |
||||
|
||||
VSRB MSG_, ROTATE_, MSG_HI_ |
||||
VAND MSG_, MASK_, MSG_ |
||||
VPERM X6_, V31, MSG_, MSG_ |
||||
VPERM X7_, V31, MSG_HI_, MSG_HI_ |
||||
|
||||
VXOR MSG_, MSG_HI_, MSG_ |
||||
|
||||
STXVD2X MSG, (OUT)(OFFSET) |
||||
|
||||
ADD $16, OFFSET, OFFSET |
||||
CMP LEN, OFFSET |
||||
BGT loop |
||||
RET |
||||
|
||||
|
||||
// func galMulPpcXorlow, high, in, out []byte) |
||||
TEXT ·galMulPpcXor(SB), NOFRAME|NOSPLIT, $0-96 |
||||
MOVD low+0(FP), LOW |
||||
MOVD high+24(FP), HIGH |
||||
MOVD in+48(FP), IN |
||||
MOVD in_len+56(FP), LEN |
||||
MOVD out+72(FP), OUT |
||||
|
||||
MOVD $16, OFFSET1 |
||||
MOVD $32, OFFSET2 |
||||
|
||||
MOVD $·constants(SB), CONSTANTS |
||||
LXVD2X (CONSTANTS)(R0), ROTATE |
||||
LXVD2X (CONSTANTS)(OFFSET1), MASK |
||||
LXVD2X (CONSTANTS)(OFFSET2), FLIP |
||||
|
||||
LXVD2X (LOW)(R0), X6 |
||||
LXVD2X (HIGH)(R0), X7 |
||||
VPERM X6_, V31, FLIP_, X6_ |
||||
VPERM X7_, V31, FLIP_, X7_ |
||||
|
||||
MOVD $0, OFFSET |
||||
|
||||
loopXor: |
||||
LXVD2X (IN)(OFFSET), MSG |
||||
LXVD2X (OUT)(OFFSET), RESULT |
||||
|
||||
VSRB MSG_, ROTATE_, MSG_HI_ |
||||
VAND MSG_, MASK_, MSG_ |
||||
VPERM X6_, V31, MSG_, MSG_ |
||||
VPERM X7_, V31, MSG_HI_, MSG_HI_ |
||||
|
||||
VXOR MSG_, MSG_HI_, MSG_ |
||||
VXOR MSG_, RESULT_, RESULT_ |
||||
|
||||
STXVD2X RESULT, (OUT)(OFFSET) |
||||
|
||||
ADD $16, OFFSET, OFFSET |
||||
CMP LEN, OFFSET |
||||
BGT loopXor |
||||
RET |
||||
|
||||
DATA ·constants+0x0(SB)/8, $0x0404040404040404 |
||||
DATA ·constants+0x8(SB)/8, $0x0404040404040404 |
||||
DATA ·constants+0x10(SB)/8, $0x0f0f0f0f0f0f0f0f |
||||
DATA ·constants+0x18(SB)/8, $0x0f0f0f0f0f0f0f0f |
||||
DATA ·constants+0x20(SB)/8, $0x0706050403020100 |
||||
DATA ·constants+0x28(SB)/8, $0x0f0e0d0c0b0a0908 |
||||
|
||||
GLOBL ·constants(SB), 8, $48 |
@ -0,0 +1,33 @@ |
||||
//+build !noasm
|
||||
|
||||
// Copyright (c) 2017 Minio Inc. All rights reserved.
|
||||
// Use of this source code is governed by a license that can be
|
||||
// found in the LICENSE file.
|
||||
|
||||
package highwayhash |
||||
|
||||
var ( |
||||
useSSE4 = false |
||||
useAVX2 = false |
||||
useNEON = false |
||||
useVMX = true |
||||
) |
||||
|
||||
//go:noescape
|
||||
func updatePpc64Le(state *[16]uint64, msg []byte) |
||||
|
||||
func initialize(state *[16]uint64, key []byte) { |
||||
initializeGeneric(state, key) |
||||
} |
||||
|
||||
func update(state *[16]uint64, msg []byte) { |
||||
if useVMX { |
||||
updatePpc64Le(state, msg) |
||||
} else { |
||||
updateGeneric(state, msg) |
||||
} |
||||
} |
||||
|
||||
func finalize(out []byte, state *[16]uint64) { |
||||
finalizeGeneric(out, state) |
||||
} |
@ -0,0 +1,183 @@ |
||||
//+build !noasm !appengine |
||||
|
||||
// |
||||
// Minio Cloud Storage, (C) 2018 Minio, Inc. |
||||
// |
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License. |
||||
// You may obtain a copy of the License at |
||||
// |
||||
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
// |
||||
// Unless required by applicable law or agreed to in writing, software |
||||
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
// See the License for the specific language governing permissions and |
||||
// limitations under the License. |
||||
// |
||||
|
||||
#include "textflag.h" |
||||
|
||||
// Definition of registers |
||||
#define V0_LO VS32 |
||||
#define V0_LO_ V0 |
||||
#define V0_HI VS33 |
||||
#define V0_HI_ V1 |
||||
#define V1_LO VS34 |
||||
#define V1_LO_ V2 |
||||
#define V1_HI VS35 |
||||
#define V1_HI_ V3 |
||||
#define MUL0_LO VS36 |
||||
#define MUL0_LO_ V4 |
||||
#define MUL0_HI VS37 |
||||
#define MUL0_HI_ V5 |
||||
#define MUL1_LO VS38 |
||||
#define MUL1_LO_ V6 |
||||
#define MUL1_HI VS39 |
||||
#define MUL1_HI_ V7 |
||||
|
||||
// Message |
||||
#define MSG_LO VS40 |
||||
#define MSG_LO_ V8 |
||||
#define MSG_HI VS41 |
||||
|
||||
// Constants |
||||
#define ROTATE VS42 |
||||
#define ROTATE_ V10 |
||||
#define MASK VS43 |
||||
#define MASK_ V11 |
||||
|
||||
// Temps |
||||
#define TEMP1 VS44 |
||||
#define TEMP1_ V12 |
||||
#define TEMP2 VS45 |
||||
#define TEMP2_ V13 |
||||
#define TEMP3 VS46 |
||||
#define TEMP3_ V14 |
||||
#define TEMP4_ V15 |
||||
#define TEMP5_ V16 |
||||
#define TEMP6_ V17 |
||||
#define TEMP7_ V18 |
||||
|
||||
// Regular registers |
||||
#define STATE R3 |
||||
#define MSG_BASE R4 |
||||
#define MSG_LEN R5 |
||||
#define CONSTANTS R6 |
||||
#define P1 R7 |
||||
#define P2 R8 |
||||
#define P3 R9 |
||||
#define P4 R10 |
||||
#define P5 R11 |
||||
#define P6 R12 |
||||
#define P7 R14 // avoid using R13 |
||||
|
||||
TEXT ·updatePpc64Le(SB), NOFRAME|NOSPLIT, $0-32 |
||||
MOVD state+0(FP), STATE |
||||
MOVD msg_base+8(FP), MSG_BASE |
||||
MOVD msg_len+16(FP), MSG_LEN // length of message |
||||
|
||||
// Sanity check for length |
||||
CMPU MSG_LEN, $31 |
||||
BLE complete |
||||
|
||||
// Setup offsets |
||||
MOVD $16, P1 |
||||
MOVD $32, P2 |
||||
MOVD $48, P3 |
||||
MOVD $64, P4 |
||||
MOVD $80, P5 |
||||
MOVD $96, P6 |
||||
MOVD $112, P7 |
||||
|
||||
// Load state |
||||
LXVD2X (STATE)(R0), V0_LO |
||||
LXVD2X (STATE)(P1), V0_HI |
||||
LXVD2X (STATE)(P2), V1_LO |
||||
LXVD2X (STATE)(P3), V1_HI |
||||
LXVD2X (STATE)(P4), MUL0_LO |
||||
LXVD2X (STATE)(P5), MUL0_HI |
||||
LXVD2X (STATE)(P6), MUL1_LO |
||||
LXVD2X (STATE)(P7), MUL1_HI |
||||
XXPERMDI V0_LO, V0_LO, $2, V0_LO |
||||
XXPERMDI V0_HI, V0_HI, $2, V0_HI |
||||
XXPERMDI V1_LO, V1_LO, $2, V1_LO |
||||
XXPERMDI V1_HI, V1_HI, $2, V1_HI |
||||
XXPERMDI MUL0_LO, MUL0_LO, $2, MUL0_LO |
||||
XXPERMDI MUL0_HI, MUL0_HI, $2, MUL0_HI |
||||
XXPERMDI MUL1_LO, MUL1_LO, $2, MUL1_LO |
||||
XXPERMDI MUL1_HI, MUL1_HI, $2, MUL1_HI |
||||
|
||||
// Load constants table pointer |
||||
MOVD $·constants(SB), CONSTANTS |
||||
LXVD2X (CONSTANTS)(R0), ROTATE |
||||
LXVD2X (CONSTANTS)(P1), MASK |
||||
XXLNAND MASK, MASK, MASK |
||||
|
||||
loop: |
||||
// Main highwayhash update loop |
||||
LXVD2X (MSG_BASE)(R0), MSG_LO |
||||
VADDUDM V0_LO_, MUL1_LO_, TEMP1_ |
||||
VRLD V0_LO_, ROTATE_, TEMP2_ |
||||
VADDUDM MUL1_HI_, V0_HI_, TEMP3_ |
||||
LXVD2X (MSG_BASE)(P1), MSG_HI |
||||
ADD $32, MSG_BASE, MSG_BASE |
||||
XXPERMDI MSG_LO, MSG_LO, $2, MSG_LO |
||||
XXPERMDI MSG_HI, MSG_HI, $2, V0_LO |
||||
VADDUDM MSG_LO_, MUL0_LO_, MSG_LO_ |
||||
VADDUDM V0_LO_, MUL0_HI_, V0_LO_ |
||||
VADDUDM MSG_LO_, V1_LO_, V1_LO_ |
||||
VSRD V0_HI_, ROTATE_, MSG_LO_ |
||||
VADDUDM V0_LO_, V1_HI_, V1_HI_ |
||||
VPERM V1_LO_, V1_LO_, MASK_, V0_LO_ |
||||
VMULOUW V1_LO_, TEMP2_, TEMP2_ |
||||
VPERM V1_HI_, V1_HI_, MASK_, TEMP7_ |
||||
VADDUDM V0_LO_, TEMP1_, V0_LO_ |
||||
VMULOUW V1_HI_, MSG_LO_, MSG_LO_ |
||||
VADDUDM TEMP7_, TEMP3_, V0_HI_ |
||||
VPERM V0_LO_, V0_LO_, MASK_, TEMP6_ |
||||
VRLD V1_LO_, ROTATE_, TEMP4_ |
||||
VSRD V1_HI_, ROTATE_, TEMP5_ |
||||
VPERM V0_HI_, V0_HI_, MASK_, TEMP7_ |
||||
XXLXOR MUL0_LO, TEMP2, MUL0_LO |
||||
VMULOUW TEMP1_, TEMP4_, TEMP1_ |
||||
VMULOUW TEMP3_, TEMP5_, TEMP3_ |
||||
XXLXOR MUL0_HI, MSG_LO, MUL0_HI |
||||
XXLXOR MUL1_LO, TEMP1, MUL1_LO |
||||
XXLXOR MUL1_HI, TEMP3, MUL1_HI |
||||
VADDUDM TEMP6_, V1_LO_, V1_LO_ |
||||
VADDUDM TEMP7_, V1_HI_, V1_HI_ |
||||
|
||||
SUB $32, MSG_LEN, MSG_LEN |
||||
CMPU MSG_LEN, $32 |
||||
BGE loop |
||||
|
||||
// Save state |
||||
XXPERMDI V0_LO, V0_LO, $2, V0_LO |
||||
XXPERMDI V0_HI, V0_HI, $2, V0_HI |
||||
XXPERMDI V1_LO, V1_LO, $2, V1_LO |
||||
XXPERMDI V1_HI, V1_HI, $2, V1_HI |
||||
XXPERMDI MUL0_LO, MUL0_LO, $2, MUL0_LO |
||||
XXPERMDI MUL0_HI, MUL0_HI, $2, MUL0_HI |
||||
XXPERMDI MUL1_LO, MUL1_LO, $2, MUL1_LO |
||||
XXPERMDI MUL1_HI, MUL1_HI, $2, MUL1_HI |
||||
STXVD2X V0_LO, (STATE)(R0) |
||||
STXVD2X V0_HI, (STATE)(P1) |
||||
STXVD2X V1_LO, (STATE)(P2) |
||||
STXVD2X V1_HI, (STATE)(P3) |
||||
STXVD2X MUL0_LO, (STATE)(P4) |
||||
STXVD2X MUL0_HI, (STATE)(P5) |
||||
STXVD2X MUL1_LO, (STATE)(P6) |
||||
STXVD2X MUL1_HI, (STATE)(P7) |
||||
|
||||
complete: |
||||
RET |
||||
|
||||
|
||||
// Constants table |
||||
DATA ·constants+0x0(SB)/8, $0x0000000000000020 |
||||
DATA ·constants+0x8(SB)/8, $0x0000000000000020 |
||||
DATA ·constants+0x10(SB)/8, $0x070806090d0a040b // zipper merge constant |
||||
DATA ·constants+0x18(SB)/8, $0x000f010e05020c03 // zipper merge constant |
||||
|
||||
GLOBL ·constants(SB), 8, $32 |
Loading…
Reference in new issue