Updated version of klauspost/reedsolomon with NEON support for ARM (#4865)

7 years ago · 93f126364e
parent 6dca044ea8
commit 93f126364e
10 changed files with 1143 additions and 15 deletions
--- a/vendor/github.com/klauspost/reedsolomon/README.md
+++ b/vendor/github.com/klauspost/reedsolomon/README.md
@ -8,7 +8,7 @@
 Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go.
-This is a golang port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.
+This is a Go port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.
 For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/).
@ -19,11 +19,17 @@ Godoc: https://godoc.org/github.com/klauspost/reedsolomon
 # Installation
 To get the package use the standard:
 ```bash
-go get github.com/klauspost/reedsolomon
+go get -u github.com/klauspost/reedsolomon
 ```
 # Changes
 ## August 26, 2017
 * The [`Encoder()`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) now contains an `Update` function contributed by [chenzhongtao](https://github.com/chenzhongtao).
 * [Frank Wessels](https://github.com/fwessels) kindly contributed ARM 64 bit assembly, which gives a huge performance boost on this platform.
 ## July 20, 2017
 `ReconstructData` added to [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface. This can cause compatibility issues if you implement your own Encoder. A simple workaround can be added:
@ -186,7 +192,7 @@ There is no buffering or timeouts/retry specified. If you want to add that, you
 For complete examples of a streaming encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).
-#Advanced Options
+# Advanced Options
 You can modify internal options which affect how jobs are split between and processed by goroutines.
@ -234,6 +240,16 @@ BenchmarkReconstruct50x20x1M-8       1364.35      4189.79      3.07x
 BenchmarkReconstruct10x4x16M-8       1484.35      5779.53      3.89x
 ```
 # Performance on ARM64 NEON
 By exploiting NEON instructions the performance for ARM has been accelerated. Below are the performance numbers for a single core on an ARM Cortex-A53 CPU @ 1.2GHz (Debian 8.0 Jessie running Go: 1.7.4):
 | Data | Parity | Parity % | ARM64 Go MB/s | ARM64 NEON MB/s | NEON Speed |
 |------|--------|--------|--------------:|----------------:|-----------:|
 | 5    | 2      | 40%    |           189 |            1304 |       588% |
 | 10   | 2      | 20%    |           188 |            1738 |       925% |
 | 10   | 4      | 40%    |            96 |             839 |       877% |
 # asm2plan9s
 [asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.
--- a/vendor/github.com/klauspost/reedsolomon/galois.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois.go
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
@ -17,7 +17,10 @@ func galMulAVX2Xor(low, high, in, out []byte)
 //go:noescape
 func galMulAVX2(low, high, in, out []byte)
-// This is what the assembler rountes does in blocks of 16 bytes:
+//go:noescape
 func sSE2XorSlice(in, out []byte)
 // This is what the assembler routines do in blocks of 16 bytes:
 /*
 func galMulSSSE3(low, high, in, out []byte) {
 	for n, input := range in {
@ -71,3 +74,18 @@ func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
 		}
 	}
 }
 // sliceXor is the Galois-field addition of two slices: out[i] ^= in[i] for
 // every index i of in. When sse2 is true the whole 16-byte blocks are handled
 // by the sSE2XorSlice assembly routine and only the leftover tail is done here.
 func sliceXor(in, out []byte, sse2 bool) {
 	tail := 0
 	if sse2 {
 		sSE2XorSlice(in, out)
 		// The assembly covers len(in) rounded down to a multiple of 16.
 		tail = len(in) &^ 15
 	}
 	for ; tail < len(in); tail++ {
 		out[tail] ^= in[tail]
 	}
 }
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
@ -162,3 +162,25 @@ done_avx2:
 	BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
 	RET
 // func sSE2XorSlice(in, out []byte)
 //
 // XORs in into out (out[i] ^= in[i]) 16 bytes per iteration using unaligned
 // 128-bit loads and stores. Only len(in)/16 whole blocks are processed; the
 // remaining len(in)%16 bytes are left untouched for the caller to handle.
 // len(out) is never read here, so the caller has to make sure out is at
 // least as long as in.
 TEXT ·sSE2XorSlice(SB), 7, $0
 	MOVQ in+0(FP), SI     // SI: &in
 	MOVQ in_len+8(FP), R9 // R9: len(in)
 	MOVQ out+24(FP), DX   // DX: &out
 	SHRQ $4, R9           // len(in) / 16
 	// Nothing to do when there is no complete 16-byte block.
 	CMPQ R9, $0
 	JEQ  done_xor_sse2
 loopback_xor_sse2:
 	MOVOU (SI), X0          // in[x]
 	MOVOU (DX), X1          // out[x]
 	// X1 = out[x] ^ in[x], written back to out.
 	PXOR  X0, X1
 	MOVOU X1, (DX)
 	ADDQ  $16, SI           // in+=16
 	ADDQ  $16, DX           // out+=16
 	// One block done; loop while blocks remain.
 	SUBQ  $1, R9
 	JNZ   loopback_xor_sse2
 done_xor_sse2:
 	RET
--- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
@ -0,0 +1,48 @@
 //+build !noasm
 //+build !appengine
 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2017, Minio, Inc.
 package reedsolomon
 // galMulNEON is implemented in galois_arm64.s. It multiplies the bytes of in
 // by c and writes the products to out, working on whole 32-byte blocks only;
 // a trailing len(in)%32 bytes are not processed.
 //go:noescape
 func galMulNEON(c uint64, in, out []byte)
 // galMulXorNEON is implemented in galois_arm64.s. It multiplies the bytes of
 // in by c and XORs the products into out, working on whole 32-byte blocks
 // only; a trailing len(in)%32 bytes are not processed.
 //go:noescape
 func galMulXorNEON(c uint64, in, out []byte)
 // galMulSlice multiplies every byte of in by c and stores the product in out.
 // Whole 32-byte blocks are computed by galMulNEON; the bytes left over are
 // looked up in mulTable. The ssse3 and avx2 arguments are not used here.
 func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
 	galMulNEON(uint64(c), in, out)
 	// The NEON routine stops after the last full 32-byte block.
 	tail := len(in) &^ 31
 	if tail == len(in) {
 		return
 	}
 	products := mulTable[c]
 	for i := tail; i < len(in); i++ {
 		out[i] = products[in[i]]
 	}
 }
 // galMulSliceXor multiplies every byte of in by c and XORs the product into
 // out. Whole 32-byte blocks are computed by galMulXorNEON; the bytes left over
 // are looked up in mulTable. The ssse3 and avx2 arguments are not used here.
 func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
 	galMulXorNEON(uint64(c), in, out)
 	// The NEON routine stops after the last full 32-byte block.
 	tail := len(in) &^ 31
 	if tail == len(in) {
 		return
 	}
 	products := mulTable[c]
 	for i := tail; i < len(in); i++ {
 		out[i] ^= products[in[i]]
 	}
 }
 // sliceXor is the Galois-field addition of two slices: out[i] ^= in[i] for
 // every index i of in. The sse2 argument is ignored by this implementation.
 func sliceXor(in, out []byte, sse2 bool) {
 	for i := 0; i < len(in); i++ {
 		out[i] ^= in[i]
 	}
 }
--- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
@ -0,0 +1,141 @@
 //+build !noasm !appengine
 // Copyright 2015, Klaus Post, see LICENSE for details.
 // Copyright 2017, Minio, Inc.
 // Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
 // the opcodes of their Plan9 equivalents
 // polynomial multiplication
 //
 // Multiplies the 32 input bytes held in v26 and v27 by the bytes of v28 using
 // pmull (lower 8 bytes) and pmull2 (upper 8 bytes). The 16-bit products are
 // left in v0, v6, v12 and v18.
 #define POLYNOMIAL_MULTIPLICATION \
 	WORD $0x0e3ce340 \ // pmull  v0.8h,v26.8b,v28.8b
 	WORD $0x4e3ce346 \ // pmull2 v6.8h,v26.16b,v28.16b
 	WORD $0x0e3ce36c \ // pmull  v12.8h,v27.8b,v28.8b
 	WORD $0x4e3ce372 // pmull2 v18.8h,v27.16b,v28.16b
 // first reduction
 //
 // shrn #8 narrows each 16-bit product in v0, v6, v12 and v18 to its upper
 // byte (results in v2, v8, v14, v20). Those bytes are multiplied by v30 with
 // pmull and the result is XORed back into the products in v0, v6, v12, v18.
 #define FIRST_REDUCTION \
 	WORD $0x0f088402 \ // shrn  v2.8b, v0.8h, #8
 	WORD $0x0f0884c8 \ // shrn  v8.8b, v6.8h, #8
 	WORD $0x0f08858e \ // shrn  v14.8b, v12.8h, #8
 	WORD $0x0f088654 \ // shrn  v20.8b, v18.8h, #8
 	WORD $0x0e22e3c3 \ // pmull v3.8h,v30.8b,v2.8b
 	WORD $0x0e28e3c9 \ // pmull v9.8h,v30.8b,v8.8b
 	WORD $0x0e2ee3cf \ // pmull v15.8h,v30.8b,v14.8b
 	WORD $0x0e34e3d5 \ // pmull v21.8h,v30.8b,v20.8b
 	WORD $0x6e201c60 \ // eor   v0.16b,v3.16b,v0.16b
 	WORD $0x6e261d26 \ // eor   v6.16b,v9.16b,v6.16b
 	WORD $0x6e2c1dec \ // eor   v12.16b,v15.16b,v12.16b
 	WORD $0x6e321eb2 // eor   v18.16b,v21.16b,v18.16b
 // second reduction
 //
 // Takes the upper byte of each product again (v4, v10, v16, v22), XORs it
 // with the upper byte saved by the first reduction (v2, v8, v14, v20),
 // multiplies the result by v30 and XORs that into the products. The four
 // results end up in v0, v1, v2 and v3.
 #define SECOND_REDUCTION \
 	WORD $0x0f088404 \ // shrn  v4.8b, v0.8h, #8
 	WORD $0x0f0884ca \ // shrn  v10.8b, v6.8h, #8
 	WORD $0x0f088590 \ // shrn  v16.8b, v12.8h, #8
 	WORD $0x0f088656 \ // shrn  v22.8b, v18.8h, #8
 	WORD $0x6e241c44 \ // eor   v4.16b,v2.16b,v4.16b
 	WORD $0x6e2a1d0a \ // eor   v10.16b,v8.16b,v10.16b
 	WORD $0x6e301dd0 \ // eor   v16.16b,v14.16b,v16.16b
 	WORD $0x6e361e96 \ // eor   v22.16b,v20.16b,v22.16b
 	WORD $0x0e24e3c5 \ // pmull v5.8h,v30.8b,v4.8b
 	WORD $0x0e2ae3cb \ // pmull v11.8h,v30.8b,v10.8b
 	WORD $0x0e30e3d1 \ // pmull v17.8h,v30.8b,v16.8b
 	WORD $0x0e36e3d7 \ // pmull v23.8h,v30.8b,v22.8b
 	WORD $0x6e201ca0 \ // eor   v0.16b,v5.16b,v0.16b
 	WORD $0x6e261d61 \ // eor   v1.16b,v11.16b,v6.16b
 	WORD $0x6e2c1e22 \ // eor   v2.16b,v17.16b,v12.16b
 	WORD $0x6e321ee3 // eor   v3.16b,v23.16b,v18.16b
 // func galMulNEON(c uint64, in, out []byte)
 //
 // Multiplies the bytes of in by c and stores the products in out, 32 bytes
 // per loop iteration. Returns immediately when len(in) < 32 and stops once
 // fewer than 32 bytes remain, so a trailing len(in)%32 bytes are not
 // processed. len(out) is never read.
 TEXT ·galMulNEON(SB), 7, $0
 	MOVD c+0(FP), R0
 	MOVD in_base+8(FP), R1
 	MOVD in_len+16(FP), R2   // length of message
 	MOVD out_base+32(FP), R5
 	// Exit early if there is not even one full 32-byte block.
 	SUBS $32, R2
 	BMI  complete
 	// Load constants table pointer
 	MOVD $·constants(SB), R3
 	// and load constants into v30 & v31
 	WORD $0x4c40a07e // ld1    {v30.16b-v31.16b}, [x3]
 	// Replicate the multiplier c into every byte of v28
 	WORD $0x4e010c1c // dup    v28.16b, w0
 loop:
 	// Main loop
 	// Load the next 32 input bytes and advance the input pointer
 	WORD $0x4cdfa83a // ld1   {v26.4s-v27.4s}, [x1], #32
 	POLYNOMIAL_MULTIPLICATION
 	FIRST_REDUCTION
 	SECOND_REDUCTION
 	// combine results
 	WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
 	WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
 	// Store result
 	WORD $0x4c9faca0 // st1    {v0.2d-v1.2d}, [x5], #32
 	// Continue while another full 32-byte block remains.
 	SUBS $32, R2
 	BPL  loop
 complete:
 	RET
 // func galMulXorNEON(c uint64, in, out []byte)
 //
 // Multiplies the bytes of in by c and XORs the products into out, 32 bytes
 // per loop iteration. Returns immediately when len(in) < 32 and stops once
 // fewer than 32 bytes remain, so a trailing len(in)%32 bytes are not
 // processed. len(out) is never read.
 TEXT ·galMulXorNEON(SB), 7, $0
 	MOVD c+0(FP), R0
 	MOVD in_base+8(FP), R1
 	MOVD in_len+16(FP), R2   // length of message
 	MOVD out_base+32(FP), R5
 	// Exit early if there is not even one full 32-byte block.
 	SUBS $32, R2
 	BMI  completeXor
 	// Load constants table pointer
 	MOVD $·constants(SB), R3
 	// and load constants into v30 & v31
 	WORD $0x4c40a07e // ld1    {v30.16b-v31.16b}, [x3]
 	// Replicate the multiplier c into every byte of v28
 	WORD $0x4e010c1c // dup    v28.16b, w0
 loopXor:
 	// Main loop
 	// Load the next 32 input bytes (advancing the input pointer) and the
 	// current 32 output bytes (the output pointer is advanced by the store).
 	WORD $0x4cdfa83a // ld1   {v26.4s-v27.4s}, [x1], #32
 	WORD $0x4c40a8b8 // ld1   {v24.4s-v25.4s}, [x5]
 	POLYNOMIAL_MULTIPLICATION
 	FIRST_REDUCTION
 	SECOND_REDUCTION
 	// combine results
 	WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
 	WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
 	// Xor result and store
 	WORD $0x6e381c00 // eor v0.16b,v0.16b,v24.16b
 	WORD $0x6e391c21 // eor v1.16b,v1.16b,v25.16b
 	WORD $0x4c9faca0 // st1   {v0.2d-v1.2d}, [x5], #32
 	// Continue while another full 32-byte block remains.
 	SUBS $32, R2
 	BPL  loopXor
 completeXor:
 	RET
 // Constants table (32 bytes; galMulNEON and galMulXorNEON load the first 16
 // bytes into v30 and the second 16 bytes into v31)
 //   generating polynomial is 29 (= 0x1d)
 DATA ·constants+0x0(SB)/8, $0x1d1d1d1d1d1d1d1d
 DATA ·constants+0x8(SB)/8, $0x1d1d1d1d1d1d1d1d
 //   constant for TBL instruction: the byte values 0x00, 0x02, ..., 0x1e,
 //   i.e. every even index into a two-register (32-byte) table
 DATA ·constants+0x10(SB)/8, $0x0e0c0a0806040200
 DATA ·constants+0x18(SB)/8, $0x1e1c1a1816141210
 GLOBL ·constants(SB), 8, $32
--- a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
@ -1,4 +1,5 @@
 //+build !amd64 noasm appengine
 //+build !arm64 noasm appengine
 // Copyright 2015, Klaus Post, see LICENSE for details.
@ -17,3 +18,10 @@ func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
 		out[n] ^= mt[input]
 	}
 }
 // sliceXor is the Galois-field addition of two slices: each byte of in is
 // XORed into the byte of out at the same index. The sse2 argument is ignored
 // by this pure Go implementation.
 func sliceXor(in, out []byte, sse2 bool) {
 	for i := range in {
 		out[i] ^= in[i]
 	}
 }
--- a/vendor/github.com/klauspost/reedsolomon/options.go
+++ b/vendor/github.com/klauspost/reedsolomon/options.go
@ -10,10 +10,10 @@ import (
 type Option func(*options)
 type options struct {
-	maxGoroutines     int
+	maxGoroutines              int
-	minSplitSize      int
+	minSplitSize               int
-	useAVX2, useSSSE3 bool
+	useAVX2, useSSSE3, useSSE2 bool
-	usePAR1Matrix     bool
+	usePAR1Matrix              bool
 }
 var defaultOptions = options{
@ -28,6 +28,7 @@ func init() {
 	// Detect CPU capabilities.
 	defaultOptions.useSSSE3 = cpuid.CPU.SSSE3()
 	defaultOptions.useAVX2 = cpuid.CPU.AVX2()
 	defaultOptions.useSSE2 = cpuid.CPU.SSE2()
 }
 // WithMaxGoroutines is the maximum number of goroutines for encoding & decoding.
@ -67,6 +68,12 @@ func withAVX2(enabled bool) Option {
 	}
 }
 // withSSE2 returns an Option that sets the useSSE2 flag to enabled.
 func withSSE2(enabled bool) Option {
 	setSSE2 := func(o *options) {
 		o.useSSE2 = enabled
 	}
 	return setSSE2
 }
 // WithPAR1Matrix causes the encoder to build the matrix how PARv1
 // does. Note that the method they use is buggy, and may lead to cases
 // where recovery is impossible, even if there are enough parity
--- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
+++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
@ -64,6 +64,14 @@ type Encoder interface {
 	// calling the Verify function is likely to fail.
 	ReconstructData(shards [][]byte) error
 	// Update is used to change a few data shards and update their parity.
 	// Input 'newDatashards' contains the data shards that have changed.
 	// Input 'shards' contains the old data shards (a data shard that has not changed may be nil) and the old parity shards.
 	// The new parity shards will be in shards[DataShards:].
 	// Update is very useful if DataShards is much larger than ParityShards and only a few data shards have changed.
 	// It is faster than Encode and does not need to read all data shards to encode.
 	Update(shards [][]byte, newDatashards [][]byte) error
 	// Split a data slice into the number of shards given to the encoder,
 	// and create empty parity shards.
 	//
@ -221,7 +229,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 }
 // ErrTooFewShards is returned if too few shards were given to
-// Encode/Verify/Reconstruct. It will also be returned from Reconstruct
+// Encode/Verify/Reconstruct/Update. It will also be returned from Reconstruct
 // if there were too few shards to reconstruct the missing data.
 var ErrTooFewShards = errors.New("too few shards given")
@ -249,6 +257,101 @@ func (r reedSolomon) Encode(shards [][]byte) error {
 	return nil
 }
 // ErrInvalidInput is returned if an invalid input parameter is passed to Update.
 var ErrInvalidInput = errors.New("invalid input")
 // Update recomputes the parity shards after some data shards have changed,
 // without needing the data shards that did not change.
 //
 // shards must contain r.Shards entries: the old data shards followed by the
 // old parity shards. A data shard that has not changed may be nil; every
 // parity shard must be non-nil. newDatashards must contain r.DataShards
 // entries, with nil for every data shard that has not changed.
 //
 // On success the new parity is in shards[r.DataShards:]. Each old data shard
 // that has a replacement is overwritten in place with old XOR new.
 //
 // ErrTooFewShards is returned when either slice has the wrong number of
 // entries, any error from checkShards is passed through, and ErrInvalidInput
 // is returned when a parity shard is nil or when a changed shard has no old
 // shard of the same length to be compared with.
 func (r reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error {
 	if len(shards) != r.Shards {
 		return ErrTooFewShards
 	}
 	if len(newDatashards) != r.DataShards {
 		return ErrTooFewShards
 	}
 	if err := checkShards(shards, true); err != nil {
 		return err
 	}
 	if err := checkShards(newDatashards, true); err != nil {
 		return err
 	}
 	for i, changed := range newDatashards {
 		if changed == nil {
 			continue
 		}
 		// The old contents are needed to compute the difference, and both
 		// versions must have the same length: sliceXor walks len(new) bytes of
 		// the old shard, so a longer new shard would run past its end and a
 		// shorter one would leave un-diffed old data to be mixed into parity.
 		if shards[i] == nil || len(changed) != len(shards[i]) {
 			return ErrInvalidInput
 		}
 	}
 	for _, p := range shards[r.DataShards:] {
 		if p == nil {
 			return ErrInvalidInput
 		}
 	}
 	size := shardSize(shards)
 	// Get the slice of output buffers.
 	output := shards[r.DataShards:]
 	// Do the coding.
 	r.updateParityShards(r.parity, shards[0:r.DataShards], newDatashards[0:r.DataShards], output, r.ParityShards, size)
 	return nil
 }
 // updateParityShards folds the changed data shards into the parity shards.
 // For every data shard with a non-nil entry in newinputs, the matching entry
 // of oldinputs is overwritten with old XOR new, and that difference,
 // multiplied by the shard's coefficient in each of the first outputCount
 // matrix rows, is XORed into the corresponding output. The work is handed to
 // updateParityShardsP when more than one goroutine is allowed and byteCount
 // exceeds the minimum split size.
 func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
 	parallel := r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize
 	if parallel {
 		r.updateParityShardsP(matrixRows, oldinputs, newinputs, outputs, outputCount, byteCount)
 		return
 	}
 	for shard := 0; shard < r.DataShards; shard++ {
 		changed := newinputs[shard]
 		if changed != nil {
 			// delta aliases the caller's old shard, which is modified in place.
 			delta := oldinputs[shard]
 			sliceXor(changed, delta, r.o.useSSE2)
 			for row := 0; row < outputCount; row++ {
 				galMulSliceXor(matrixRows[row][shard], delta, outputs[row], r.o.useSSSE3, r.o.useAVX2)
 			}
 		}
 	}
 }
 // updateParityShardsP does the same work as updateParityShards but splits the
 // byte range [0, byteCount) into chunks and processes each chunk in its own
 // goroutine. A chunk is byteCount/maxGoroutines bytes, but never smaller than
 // the minimum split size; the last chunk is shortened to fit. The function
 // returns once every goroutine has finished.
 func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
 	chunk := byteCount / r.o.maxGoroutines
 	if chunk < r.o.minSplitSize {
 		chunk = r.o.minSplitSize
 	}
 	var wg sync.WaitGroup
 	for begin := 0; begin < byteCount; {
 		end := begin + chunk
 		if end > byteCount {
 			end = byteCount
 		}
 		wg.Add(1)
 		go func(lo, hi int) {
 			for shard := 0; shard < r.DataShards; shard++ {
 				changed := newinputs[shard]
 				if changed != nil {
 					// The old shard is modified in place within [lo, hi).
 					old := oldinputs[shard]
 					sliceXor(changed[lo:hi], old[lo:hi], r.o.useSSE2)
 					for row := 0; row < outputCount; row++ {
 						galMulSliceXor(matrixRows[row][shard], old[lo:hi], outputs[row][lo:hi], r.o.useSSSE3, r.o.useAVX2)
 					}
 				}
 			}
 			wg.Done()
 		}(begin, end)
 		begin = end
 	}
 	wg.Wait()
 }
 // Verify returns true if the parity shards contain the right data.
 // The data is the same format as Encode. No data is modified.
 func (r reedSolomon) Verify(shards [][]byte) (bool, error) {
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@ -243,10 +243,10 @@
 			"revisionTime": "2016-10-16T15:41:25Z"
 		},
 		{
-			"checksumSHA1": "gYAsuckCW3o4veePKZzEHvCcJro=",
+			"checksumSHA1": "R9saYJznxosfknAq2aPnVKxqI3w=",
 			"path": "github.com/klauspost/reedsolomon",
-			"revision": "48a4fd05f1730dd3ef9c3f9e943f6091d063f2c4",
+			"revision": "87ba8262ab3d167ae4d38e22796312cd2a9d0b19",
-			"revisionTime": "2017-07-22T14:16:58Z"
+			"revisionTime": "2017-08-26T09:54:10Z"
 		},
 		{
 			"checksumSHA1": "dNYxHiBLalTqluak2/Z8c3RsSEM=",