Updated version of klauspost/reedsolomon with NEON support for ARM (#4865)

7 years ago · 93f126364e
parent 6dca044ea8
commit 93f126364e
10 changed files with 1143 additions and 15 deletions
--- a/vendor/github.com/klauspost/reedsolomon/README.md
+++ b/vendor/github.com/klauspost/reedsolomon/README.md
@ -8,7 +8,7 @@

 Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go.

-This is a golang port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.
+This is a Go port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReedSolomon) library released by [Backblaze](http://backblaze.com), with some additional optimizations.

 For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/).

@ -19,11 +19,17 @@ Godoc: https://godoc.org/github.com/klauspost/reedsolomon
 # Installation
 To get the package use the standard:
 ```bash
-go get github.com/klauspost/reedsolomon
+go get -u github.com/klauspost/reedsolomon
 ```

 # Changes

+## August 26, 2017
+
+* The [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface now contains an `Update` function contributed by [chenzhongtao](https://github.com/chenzhongtao).
+
+* [Frank Wessels](https://github.com/fwessels) kindly contributed ARM 64 bit assembly, which gives a huge performance boost on this platform.
+
 ## July 20, 2017

 `ReconstructData` added to [`Encoder`](https://godoc.org/github.com/klauspost/reedsolomon#Encoder) interface. This can cause compatibility issues if you implement your own Encoder. A simple workaround can be added:
@ -186,7 +192,7 @@ There is no buffering or timeouts/retry specified. If you want to add that, you

 For complete examples of a streaming encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples).

-#Advanced Options
+# Advanced Options

 You can modify internal options which affects how jobs are split between and processed by goroutines.

@ -234,6 +240,16 @@ BenchmarkReconstruct50x20x1M-8       1364.35      4189.79      3.07x
 BenchmarkReconstruct10x4x16M-8       1484.35      5779.53      3.89x
 ```

+# Performance on ARM64 NEON
+
+By exploiting NEON instructions the performance for ARM has been accelerated. Below are the performance numbers for a single core on an ARM Cortex-A53 CPU @ 1.2GHz (Debian 8.0 Jessie running Go: 1.7.4):
+
+| Data | Parity | Parity % | ARM64 Go MB/s | ARM64 NEON MB/s | NEON Speed |
+|------|--------|----------|--------------:|----------------:|-----------:|
+| 5    | 2      | 40%      |           189 |            1304 |       588% |
+| 10   | 2      | 20%      |           188 |            1738 |       925% |
+| 10   | 4      | 40%      |            96 |             839 |       877% |
+
 # asm2plan9s

 [asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents.
--- a/vendor/github.com/klauspost/reedsolomon/galois.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois.go
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go
@ -17,7 +17,10 @@ func galMulAVX2Xor(low, high, in, out []byte)
 //go:noescape
 func galMulAVX2(low, high, in, out []byte)

-// This is what the assembler rountes does in blocks of 16 bytes:
+// sSE2XorSlice XORs in into out, 16 bytes at a time. It is implemented in
+// assembly (galois_amd64.s) and only processes the first len(in)/16 complete
+// blocks; the caller must handle the remaining len(in)%16 bytes itself.
+//go:noescape
+func sSE2XorSlice(in, out []byte)
+
+// This is what the assembler routines do in blocks of 16 bytes:
 /*
 func galMulSSSE3(low, high, in, out []byte) {
 	for n, input := range in {
@ -71,3 +74,18 @@ func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
 		}
 	}
 }
+
+// slice galois add
+func sliceXor(in, out []byte, sse2 bool) {
+	var done int
+	if sse2 {
+		sSE2XorSlice(in, out)
+		done = (len(in) >> 4) << 4
+	}
+	remain := len(in) - done
+	if remain > 0 {
+		for i := done; i < len(in); i++ {
+			out[i] ^= in[i]
+		}
+	}
+}
--- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.s
@ -162,3 +162,25 @@ done_avx2:

 	BYTE $0xc5; BYTE $0xf8; BYTE $0x77 // VZEROUPPER
 	RET
+
+// func sSE2XorSlice(in, out []byte)
+//
+// XORs in into out, 16 bytes per iteration, for the first len(in)/16 blocks.
+// The trailing len(in)%16 bytes are not touched. len(out) is never read, so
+// out must be at least as long as the processed part of in.
+// Flag 7 is NOPROF|DUPOK|NOSPLIT (see textflag.h).
+TEXT ·sSE2XorSlice(SB), 7, $0
+	MOVQ in+0(FP), SI     // SI: &in
+	MOVQ in_len+8(FP), R9 // R9: len(in)
+	MOVQ out+24(FP), DX   // DX: &out
+	SHRQ $4, R9           // len(in) / 16
+	CMPQ R9, $0
+	JEQ  done_xor_sse2    // nothing to do when there is no complete block
+
+loopback_xor_sse2:
+	MOVOU (SI), X0          // in[x]
+	MOVOU (DX), X1          // out[x]
+	PXOR  X0, X1            // X1 = out[x] ^ in[x]
+	MOVOU X1, (DX)          // write the result back to out
+	ADDQ  $16, SI           // in+=16
+	ADDQ  $16, DX           // out+=16
+	SUBQ  $1, R9            // one block fewer remaining
+	JNZ   loopback_xor_sse2
+
+done_xor_sse2:
+	RET
--- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go
@ -0,0 +1,48 @@
+//+build !noasm
+//+build !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2017, Minio, Inc.
+
+package reedsolomon
+
+// galMulNEON is implemented in galois_arm64.s. For every complete 32-byte
+// block of in it stores the Galois-field product of c and each input byte to
+// out. The trailing len(in)%32 bytes are left untouched.
+//go:noescape
+func galMulNEON(c uint64, in, out []byte)
+
+// galMulXorNEON is implemented in galois_arm64.s. It processes the same
+// complete 32-byte blocks as galMulNEON, but XORs each product into out
+// instead of overwriting it.
+//go:noescape
+func galMulXorNEON(c uint64, in, out []byte)
+
+func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
+	var done int
+	galMulNEON(uint64(c), in, out)
+	done = (len(in) >> 5) << 5
+
+	remain := len(in) - done
+	if remain > 0 {
+		mt := mulTable[c]
+		for i := done; i < len(in); i++ {
+			out[i] = mt[in[i]]
+		}
+	}
+}
+
+func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
+	var done int
+	galMulXorNEON(uint64(c), in, out)
+	done = (len(in) >> 5) << 5
+
+	remain := len(in) - done
+	if remain > 0 {
+		mt := mulTable[c]
+		for i := done; i < len(in); i++ {
+			out[i] ^= mt[in[i]]
+		}
+	}
+}
+
+// sliceXor XORs each byte of in into the byte at the same position of out.
+// out must be at least as long as in. The sse2 flag is ignored here.
+func sliceXor(in, out []byte, sse2 bool) {
+	for i := 0; i < len(in); i++ {
+		out[i] ^= in[i]
+	}
+}
--- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
+++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.s
@ -0,0 +1,141 @@
+// Constraints on separate lines are ANDed, matching galois_arm64.go; on one
+// line they would be ORed and the file would still build with -tags noasm.
+//+build !noasm
+//+build !appengine
+
+// Copyright 2015, Klaus Post, see LICENSE for details.
+// Copyright 2017, Minio, Inc.
+
+// Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
+// the opcodes of their Plan9 equivalents
+
+// POLYNOMIAL_MULTIPLICATION multiplies the 32 input bytes held in v26/v27 by
+// the coefficient broadcast into v28 (polynomial multiply, per the
+// pmull/pmull2 mnemonics), leaving 16-bit products in v0, v6, v12 and v18.
+#define POLYNOMIAL_MULTIPLICATION \
+	WORD $0x0e3ce340 \ // pmull  v0.8h,v26.8b,v28.8b
+	WORD $0x4e3ce346 \ // pmull2 v6.8h,v26.16b,v28.16b
+	WORD $0x0e3ce36c \ // pmull  v12.8h,v27.8b,v28.8b
+	WORD $0x4e3ce372 // pmull2 v18.8h,v27.16b,v28.16b
+
+// FIRST_REDUCTION narrows the upper byte of each 16-bit product (shrn #8) into
+// v2/v8/v14/v20, multiplies those bytes by the constant held in v30 and XORs
+// the result back into the products in v0/v6/v12/v18.
+#define FIRST_REDUCTION \
+	WORD $0x0f088402 \ // shrn  v2.8b, v0.8h, #8
+	WORD $0x0f0884c8 \ // shrn  v8.8b, v6.8h, #8
+	WORD $0x0f08858e \ // shrn  v14.8b, v12.8h, #8
+	WORD $0x0f088654 \ // shrn  v20.8b, v18.8h, #8
+	WORD $0x0e22e3c3 \ // pmull v3.8h,v30.8b,v2.8b
+	WORD $0x0e28e3c9 \ // pmull v9.8h,v30.8b,v8.8b
+	WORD $0x0e2ee3cf \ // pmull v15.8h,v30.8b,v14.8b
+	WORD $0x0e34e3d5 \ // pmull v21.8h,v30.8b,v20.8b
+	WORD $0x6e201c60 \ // eor   v0.16b,v3.16b,v0.16b
+	WORD $0x6e261d26 \ // eor   v6.16b,v9.16b,v6.16b
+	WORD $0x6e2c1dec \ // eor   v12.16b,v15.16b,v12.16b
+	WORD $0x6e321eb2 // eor   v18.16b,v21.16b,v18.16b
+
+// SECOND_REDUCTION repeats the fold for the bits the first reduction left in
+// the upper byte: it narrows the upper bytes again, XORs out the bytes already
+// handled (v2/v8/v14/v20), multiplies the remainder by the constant in v30 and
+// XORs it into the products. The four results end up in v0, v1, v2 and v3.
+#define SECOND_REDUCTION \
+	WORD $0x0f088404 \ // shrn  v4.8b, v0.8h, #8
+	WORD $0x0f0884ca \ // shrn  v10.8b, v6.8h, #8
+	WORD $0x0f088590 \ // shrn  v16.8b, v12.8h, #8
+	WORD $0x0f088656 \ // shrn  v22.8b, v18.8h, #8
+	WORD $0x6e241c44 \ // eor   v4.16b,v2.16b,v4.16b
+	WORD $0x6e2a1d0a \ // eor   v10.16b,v8.16b,v10.16b
+	WORD $0x6e301dd0 \ // eor   v16.16b,v14.16b,v16.16b
+	WORD $0x6e361e96 \ // eor   v22.16b,v20.16b,v22.16b
+	WORD $0x0e24e3c5 \ // pmull v5.8h,v30.8b,v4.8b
+	WORD $0x0e2ae3cb \ // pmull v11.8h,v30.8b,v10.8b
+	WORD $0x0e30e3d1 \ // pmull v17.8h,v30.8b,v16.8b
+	WORD $0x0e36e3d7 \ // pmull v23.8h,v30.8b,v22.8b
+	WORD $0x6e201ca0 \ // eor   v0.16b,v5.16b,v0.16b
+	WORD $0x6e261d61 \ // eor   v1.16b,v11.16b,v6.16b
+	WORD $0x6e2c1e22 \ // eor   v2.16b,v17.16b,v12.16b
+	WORD $0x6e321ee3 // eor   v3.16b,v23.16b,v18.16b
+
+// func galMulNEON(c uint64, in, out []byte)
+//
+// For every complete 32-byte block of in, stores the product of c and each
+// input byte to the same position of out. Returns without touching out when
+// len(in) < 32; the trailing len(in)%32 bytes are left for the caller.
+// Flag 7 is NOPROF|DUPOK|NOSPLIT (see textflag.h).
+TEXT ·galMulNEON(SB), 7, $0
+	MOVD c+0(FP), R0
+	MOVD in_base+8(FP), R1
+	MOVD in_len+16(FP), R2   // length of message
+	MOVD out_base+32(FP), R5
+	// Bail out if fewer than 32 bytes are available.
+	SUBS $32, R2
+	BMI  complete
+
+	// Load constants table pointer
+	MOVD $·constants(SB), R3
+
+	// and load constants into v30 & v31
+	WORD $0x4c40a07e // ld1    {v30.16b-v31.16b}, [x3]
+
+	// Broadcast the coefficient c into every byte of v28
+	WORD $0x4e010c1c // dup    v28.16b, w0
+
+loop:
+	// Main loop: load 32 input bytes and advance the input pointer
+	WORD $0x4cdfa83a // ld1   {v26.4s-v27.4s}, [x1], #32
+
+	POLYNOMIAL_MULTIPLICATION
+
+	FIRST_REDUCTION
+
+	SECOND_REDUCTION
+
+	// combine results: v31 selects the even-numbered bytes, i.e. the low byte
+	// of each 16-bit lane, packing the four result registers into v0 and v1
+	WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
+	WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
+
+	// Store result and advance the output pointer
+	WORD $0x4c9faca0 // st1    {v0.2d-v1.2d}, [x5], #32
+
+	// Continue while at least one more complete block remains
+	SUBS $32, R2
+	BPL  loop
+
+complete:
+	RET
+
+// func galMulXorNEON(c uint64, in, out []byte)
+//
+// For every complete 32-byte block of in, XORs the product of c and each
+// input byte into the same position of out. Returns without touching out when
+// len(in) < 32; the trailing len(in)%32 bytes are left for the caller.
+// Flag 7 is NOPROF|DUPOK|NOSPLIT (see textflag.h).
+TEXT ·galMulXorNEON(SB), 7, $0
+	MOVD c+0(FP), R0
+	MOVD in_base+8(FP), R1
+	MOVD in_len+16(FP), R2   // length of message
+	MOVD out_base+32(FP), R5
+	// Bail out if fewer than 32 bytes are available.
+	SUBS $32, R2
+	BMI  completeXor
+
+	// Load constants table pointer
+	MOVD $·constants(SB), R3
+
+	// and load constants into v30 & v31
+	WORD $0x4c40a07e // ld1    {v30.16b-v31.16b}, [x3]
+
+	// Broadcast the coefficient c into every byte of v28
+	WORD $0x4e010c1c // dup    v28.16b, w0
+
+loopXor:
+	// Main loop: load 32 input bytes (advancing the input pointer) and the
+	// current 32 output bytes (output pointer is advanced by the store below)
+	WORD $0x4cdfa83a // ld1   {v26.4s-v27.4s}, [x1], #32
+	WORD $0x4c40a8b8 // ld1   {v24.4s-v25.4s}, [x5]
+
+	POLYNOMIAL_MULTIPLICATION
+
+	FIRST_REDUCTION
+
+	SECOND_REDUCTION
+
+	// combine results: v31 selects the even-numbered bytes, i.e. the low byte
+	// of each 16-bit lane, packing the four result registers into v0 and v1
+	WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b
+	WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b
+
+	// Xor result and store
+	WORD $0x6e381c00 // eor v0.16b,v0.16b,v24.16b
+	WORD $0x6e391c21 // eor v1.16b,v1.16b,v25.16b
+	WORD $0x4c9faca0 // st1   {v0.2d-v1.2d}, [x5], #32
+
+	// Continue while at least one more complete block remains
+	SUBS $32, R2
+	BPL  loopXor
+
+completeXor:
+	RET
+
+// Constants table: 32 bytes, loaded into v30 (first 16) and v31 (last 16)
+// by galMulNEON and galMulXorNEON.
+//   generating polynomial is 29 (= 0x1d), replicated into every byte
+DATA ·constants+0x0(SB)/8, $0x1d1d1d1d1d1d1d1d
+DATA ·constants+0x8(SB)/8, $0x1d1d1d1d1d1d1d1d
+//   constant for TBL instruction: the byte indices 0, 2, 4, ..., 30
+DATA ·constants+0x10(SB)/8, $0x0e0c0a0806040200
+DATA ·constants+0x18(SB)/8, $0x1e1c1a1816141210
+
+// Flag 8 is RODATA (see textflag.h).
+GLOBL ·constants(SB), 8, $32
--- a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
+++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go
@ -1,4 +1,5 @@
 //+build !amd64 noasm appengine
+//+build !arm64 noasm appengine

 // Copyright 2015, Klaus Post, see LICENSE for details.

@ -17,3 +18,10 @@ func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
 		out[n] ^= mt[input]
 	}
 }
+
+// sliceXor XORs in into out byte by byte: out[i] ^= in[i] for every index of
+// in. out must be at least as long as in. The sse2 flag has no effect in
+// this pure Go implementation.
+func sliceXor(in, out []byte, sse2 bool) {
+	for i := range in {
+		out[i] ^= in[i]
+	}
+}
--- a/vendor/github.com/klauspost/reedsolomon/options.go
+++ b/vendor/github.com/klauspost/reedsolomon/options.go
@ -12,7 +12,7 @@ type Option func(*options)
 type options struct {
 	maxGoroutines              int
 	minSplitSize               int
-	useAVX2, useSSSE3 bool
+	useAVX2, useSSSE3, useSSE2 bool
 	usePAR1Matrix              bool
 }

@ -28,6 +28,7 @@ func init() {
 	// Detect CPU capabilities.
 	defaultOptions.useSSSE3 = cpuid.CPU.SSSE3()
 	defaultOptions.useAVX2 = cpuid.CPU.AVX2()
+	defaultOptions.useSSE2 = cpuid.CPU.SSE2()
 }

 // WithMaxGoroutines is the maximum number of goroutines number for encoding & decoding.
@ -67,6 +68,12 @@ func withAVX2(enabled bool) Option {
 	}
 }

+// withSSE2 returns an Option that sets the useSSE2 flag to enabled.
+func withSSE2(enabled bool) Option {
+	return func(opts *options) { opts.useSSE2 = enabled }
+}
+
 // WithPAR1Matrix causes the encoder to build the matrix how PARv1
 // does. Note that the method they use is buggy, and may lead to cases
 // where recovery is impossible, even if there are enough parity
--- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
+++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go
@ -64,6 +64,14 @@ type Encoder interface {
 	// calling the Verify function is likely to fail.
 	ReconstructData(shards [][]byte) error

+	// Update is used to change a few data shards and update their parity.
+	// Input 'newDatashards' contains the data shards that changed.
+	// Input 'shards' contains the old data shards (a data shard that did not change may be nil) and the old parity shards.
+	// The new parity shards will be in shards[DataShards:].
+	// Update is very useful if DataShards is much larger than ParityShards and only a few data shards changed. It is
+	// faster than Encode and does not need to read all data shards to encode.
+	Update(shards [][]byte, newDatashards [][]byte) error
+
 	// Split a data slice into the number of shards given to the encoder,
 	// and create empty parity shards.
 	//
@ -221,7 +229,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 }

 // ErrTooFewShards is returned if too few shards where given to
-// Encode/Verify/Reconstruct. It will also be returned from Reconstruct
+// Encode/Verify/Reconstruct/Update. It will also be returned from Reconstruct
 // if there were too few shards to reconstruct the missing data.
 var ErrTooFewShards = errors.New("too few shards given")

@ -249,6 +257,101 @@ func (r reedSolomon) Encode(shards [][]byte) error {
 	return nil
 }

+// ErrInvalidInput is returned by Update when its input parameters are invalid.
+var ErrInvalidInput = errors.New("invalid input")
+
+// Update recomputes the parity shards after some data shards have changed,
+// without reading the unchanged data shards.
+//
+// shards must hold r.Shards entries: the old data shards followed by the old
+// parity shards. An old data shard may be nil only if it is not being
+// changed; parity shards must never be nil. newDatashards must hold
+// r.DataShards entries, with nil for every data shard that is unchanged.
+//
+// On success the new parity is in shards[r.DataShards:]. Note that every
+// changed entry of shards[:r.DataShards] is overwritten with old XOR new,
+// not with the new data.
+//
+// ErrTooFewShards is returned when either slice has the wrong number of
+// entries. ErrInvalidInput is returned when a required old shard is missing
+// or when a changed data shard, its old counterpart or a parity shard does
+// not have the common shard size. Errors from checkShards are returned
+// unchanged.
+func (r reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error {
+	if len(shards) != r.Shards {
+		return ErrTooFewShards
+	}
+	if len(newDatashards) != r.DataShards {
+		return ErrTooFewShards
+	}
+
+	if err := checkShards(shards, true); err != nil {
+		return err
+	}
+	if err := checkShards(newDatashards, true); err != nil {
+		return err
+	}
+
+	size := shardSize(shards)
+
+	for i, nd := range newDatashards {
+		if nd == nil {
+			continue
+		}
+		// The old contents are needed to compute the delta, and old and new
+		// buffers are walked over the same byte range below. A size mismatch
+		// would read or write out of bounds or produce wrong parity.
+		if shards[i] == nil || len(shards[i]) != size || len(nd) != size {
+			return ErrInvalidInput
+		}
+	}
+	for _, p := range shards[r.DataShards:] {
+		// Every parity shard is written over the full shard size.
+		if p == nil || len(p) != size {
+			return ErrInvalidInput
+		}
+	}
+
+	// Get the slice of output buffers.
+	output := shards[r.DataShards:]
+
+	// Do the coding.
+	r.updateParityShards(r.parity, shards[0:r.DataShards], newDatashards[0:r.DataShards], output, r.ParityShards, size)
+	return nil
+}
+
+// updateParityShards applies the difference between new and old data shards
+// to the parity shards in outputs. For every non-nil newinputs[c] it first
+// XORs the new data into oldinputs[c] (which then holds old XOR new) and
+// then, for each of the outputCount parity rows, XORs that delta multiplied
+// by matrixRows[iRow][c] into outputs[iRow].
+// When more than one goroutine is allowed and byteCount exceeds the minimum
+// split size, the work is handed to updateParityShardsP instead.
+func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
+	parallel := r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize
+	if parallel {
+		r.updateParityShardsP(matrixRows, oldinputs, newinputs, outputs, outputCount, byteCount)
+		return
+	}
+
+	for c := 0; c < r.DataShards; c++ {
+		if fresh := newinputs[c]; fresh != nil {
+			// delta aliases the caller's old shard, which is modified in place.
+			delta := oldinputs[c]
+			sliceXor(fresh, delta, r.o.useSSE2)
+			for row := 0; row < outputCount; row++ {
+				galMulSliceXor(matrixRows[row][c], delta, outputs[row], r.o.useSSSE3, r.o.useAVX2)
+			}
+		}
+	}
+}
+
+// updateParityShardsP does the same work as the serial loop in
+// updateParityShards, but splits the byte range [0, byteCount) into chunks of
+// byteCount/maxGoroutines bytes (at least minSplitSize) and processes each
+// chunk in its own goroutine. It returns once all goroutines have finished.
+// The changed entries of oldinputs are overwritten with old XOR new.
+func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
+	chunk := byteCount / r.o.maxGoroutines
+	if chunk < r.o.minSplitSize {
+		chunk = r.o.minSplitSize
+	}
+
+	var wg sync.WaitGroup
+	for begin := 0; begin < byteCount; begin += chunk {
+		// The last chunk may be shorter than the others.
+		if begin+chunk > byteCount {
+			chunk = byteCount - begin
+		}
+		wg.Add(1)
+		go func(lo, hi int) {
+			defer wg.Done()
+			for c := 0; c < r.DataShards; c++ {
+				fresh := newinputs[c]
+				if fresh == nil {
+					continue
+				}
+				// delta aliases the caller's old shard, which is modified in place.
+				delta := oldinputs[c]
+				sliceXor(fresh[lo:hi], delta[lo:hi], r.o.useSSE2)
+				for row := 0; row < outputCount; row++ {
+					galMulSliceXor(matrixRows[row][c], delta[lo:hi], outputs[row][lo:hi], r.o.useSSSE3, r.o.useAVX2)
+				}
+			}
+		}(begin, begin+chunk)
+	}
+	wg.Wait()
+}
+
 // Verify returns true if the parity shards contain the right data.
 // The data is the same format as Encode. No data is modified.
 func (r reedSolomon) Verify(shards [][]byte) (bool, error) {
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@ -243,10 +243,10 @@
 			"revisionTime": "2016-10-16T15:41:25Z"
 		},
 		{
-			"checksumSHA1": "gYAsuckCW3o4veePKZzEHvCcJro=",
+			"checksumSHA1": "R9saYJznxosfknAq2aPnVKxqI3w=",
 			"path": "github.com/klauspost/reedsolomon",
-			"revision": "48a4fd05f1730dd3ef9c3f9e943f6091d063f2c4",
-			"revisionTime": "2017-07-22T14:16:58Z"
+			"revision": "87ba8262ab3d167ae4d38e22796312cd2a9d0b19",
+			"revisionTime": "2017-08-26T09:54:10Z"
 		},
 		{
 			"checksumSHA1": "dNYxHiBLalTqluak2/Z8c3RsSEM=",