minio/vendor/github.com/klauspost/reedsolomon/galois_amd64.s

//+build !noasm !appengine

// Copyright 2015, Klaus Post, see LICENSE for details.

// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
// and http://jerasure.org/jerasure/gf-complete/tree/master

// func galMulSSSE3Xor(low, high, in, out []byte)
//
// For each full 16-byte block of in: the low nibble of every input byte is
// used as a PSHUFB index into the 16-byte table at low, the high nibble as an
// index into the 16-byte table at high, the two lookups are XORed together
// and the result is XORed into the matching 16 bytes of out.
//
// Only len(in)/16 whole blocks are processed; trailing bytes of in/out are
// not touched. len(out) is never read, so the caller must make sure out
// covers every processed block. If both in and out start on a 16-byte
// boundary the aligned loop (MOVOA) runs, otherwise the unaligned one (MOVOU).
TEXT ·galMulSSSE3Xor(SB), 7, $0
	MOVQ   low+0(FP), SI     // SI: &low
	MOVQ   high+24(FP), DX   // DX: &high
	MOVOU  (SI), X6          // X6: low table
	MOVOU  (DX), X7          // X7: high table
	MOVQ   $15, BX           // BX: low nibble mask (0x0f)
	MOVQ   BX, X8            // X8: mask in byte 0
	PXOR   X5, X5            // X5: all-zero shuffle control
	MOVQ   in+48(FP), SI     // SI: &in
	MOVQ   in_len+56(FP), R9 // R9: len(in)
	MOVQ   out+72(FP), DX    // DX: &out
	PSHUFB X5, X8            // X8: lomask, byte 0 copied into all 16 bytes
	SHRQ   $4, R9            // R9: len(in) / 16
	TESTQ  R9, R9
	JZ     done_xor          // no full 16-byte block
	MOVQ   SI, AX
	ORQ    DX, AX            // AX: &in | &out
	TESTQ  $15, AX           // any low address bit set => not both 16-byte aligned
	JNZ    loopback_xor

loopback_xor_aligned:
	MOVOA  (SI), X0             // X0: in[x]
	MOVOA  (DX), X4             // X4: out[x]
	MOVOA  X0, X1               // X1: in[x]
	MOVOA  X6, X2               // X2: low table copy (PSHUFB overwrites it)
	MOVOA  X7, X3               // X3: high table copy (PSHUFB overwrites it)
	PSRLQ  $4, X1               // X1: input shifted right by 4 bits
	PAND   X8, X0               // X0: low nibbles of input
	PAND   X8, X1               // X1: high nibbles of input
	PSHUFB X0, X2               // X2: mul low part
	PSHUFB X1, X3               // X3: mul high part
	PXOR   X2, X3               // X3: result
	PXOR   X4, X3               // X3: result xor existing out
	MOVOA  X3, (DX)             // store to out[x]
	ADDQ   $16, SI              // in+=16
	ADDQ   $16, DX              // out+=16
	SUBQ   $1, R9
	JNZ    loopback_xor_aligned
	JMP    done_xor

loopback_xor:
	MOVOU  (SI), X0     // X0: in[x]
	MOVOU  (DX), X4     // X4: out[x]
	MOVOA  X0, X1       // X1: in[x]
	MOVOA  X6, X2       // X2: low table copy
	MOVOA  X7, X3       // X3: high table copy
	PSRLQ  $4, X1       // X1: input shifted right by 4 bits
	PAND   X8, X0       // X0: low nibbles of input
	PAND   X8, X1       // X1: high nibbles of input
	PSHUFB X0, X2       // X2: mul low part
	PSHUFB X1, X3       // X3: mul high part
	PXOR   X2, X3       // X3: result
	PXOR   X4, X3       // X3: result xor existing out
	MOVOU  X3, (DX)     // store to out[x]
	ADDQ   $16, SI      // in+=16
	ADDQ   $16, DX      // out+=16
	SUBQ   $1, R9
	JNZ    loopback_xor

done_xor:
	RET

// func galMulSSSE3(low, high, in, out []byte)
//
// For each full 16-byte block of in: the low nibble of every input byte is
// used as a PSHUFB index into the 16-byte table at low, the high nibble as an
// index into the 16-byte table at high, and the XOR of the two lookups is
// written to the matching 16 bytes of out (previous contents are overwritten).
//
// Only len(in)/16 whole blocks are processed; trailing bytes of in/out are
// not touched. len(out) is never read, so the caller must make sure out
// covers every processed block. If both in and out start on a 16-byte
// boundary the aligned loop (MOVOA) runs, otherwise the unaligned one (MOVOU).
TEXT ·galMulSSSE3(SB), 7, $0
	MOVQ   low+0(FP), SI     // SI: &low
	MOVQ   high+24(FP), DX   // DX: &high
	MOVOU  (SI), X6          // X6: low table
	MOVOU  (DX), X7          // X7: high table
	MOVQ   $15, BX           // BX: low nibble mask (0x0f)
	MOVQ   BX, X8            // X8: mask in byte 0
	PXOR   X5, X5            // X5: all-zero shuffle control
	MOVQ   in+48(FP), SI     // SI: &in
	MOVQ   in_len+56(FP), R9 // R9: len(in)
	MOVQ   out+72(FP), DX    // DX: &out
	PSHUFB X5, X8            // X8: lomask, byte 0 copied into all 16 bytes
	SHRQ   $4, R9            // R9: len(in) / 16
	TESTQ  R9, R9
	JZ     done              // no full 16-byte block
	MOVQ   SI, AX
	ORQ    DX, AX            // AX: &in | &out
	TESTQ  $15, AX           // any low address bit set => not both 16-byte aligned
	JNZ    loopback

loopback_aligned:
	MOVOA  (SI), X0         // X0: in[x]
	MOVOA  X0, X1           // X1: in[x]
	MOVOA  X6, X2           // X2: low table copy (PSHUFB overwrites it)
	MOVOA  X7, X3           // X3: high table copy (PSHUFB overwrites it)
	PSRLQ  $4, X1           // X1: input shifted right by 4 bits
	PAND   X8, X0           // X0: low nibbles of input
	PAND   X8, X1           // X1: high nibbles of input
	PSHUFB X0, X2           // X2: mul low part
	PSHUFB X1, X3           // X3: mul high part
	PXOR   X2, X3           // X3: result
	MOVOA  X3, (DX)         // store to out[x]
	ADDQ   $16, SI          // in+=16
	ADDQ   $16, DX          // out+=16
	SUBQ   $1, R9
	JNZ    loopback_aligned
	JMP    done

loopback:
	MOVOU  (SI), X0 // X0: in[x]
	MOVOA  X0, X1   // X1: in[x]
	MOVOA  X6, X2   // X2: low table copy
	MOVOA  X7, X3   // X3: high table copy
	PSRLQ  $4, X1   // X1: input shifted right by 4 bits
	PAND   X8, X0   // X0: low nibbles of input
	PAND   X8, X1   // X1: high nibbles of input
	PSHUFB X0, X2   // X2: mul low part
	PSHUFB X1, X3   // X3: mul high part
	PXOR   X2, X3   // X3: result
	MOVOU  X3, (DX) // store to out[x]
	ADDQ   $16, SI  // in+=16
	ADDQ   $16, DX  // out+=16
	SUBQ   $1, R9
	JNZ    loopback

done:
	RET

// func galMulAVX2Xor(low, high, in, out []byte)
//
// 32-bytes-per-iteration variant of the nibble table lookup: the 16-byte
// tables at low and high are duplicated into both 128-bit lanes of Y6/Y7,
// each input byte is split into its low and high nibble, both nibbles are
// looked up with VPSHUFB, and the XOR of the two lookups is XORed into out.
//
// Only len(in)/32 whole blocks are processed; len(out) is never read.
// All loads and stores are unaligned (VMOVDQU).
TEXT ·galMulAVX2Xor(SB), 7, $0
	MOVQ  low+0(FP), AX     // AX: &low
	MOVQ  high+24(FP), CX   // CX: &high
	MOVQ  $15, R8           // R8: low nibble mask (0x0f)
	MOVOU (AX), X6          // X6: low table
	MOVOU (CX), X7          // X7: high table
	MOVQ  R8, X5            // X5: mask in byte 0
	MOVQ  in+48(FP), SI     // SI: &in
	MOVQ  out+72(FP), DI    // DI: &out
	MOVQ  in_len+56(FP), R9 // R9: len(in)

	VINSERTI128  $1, X6, Y6, Y6 // Y6: low table in both lanes
	VINSERTI128  $1, X7, Y7, Y7 // Y7: high table in both lanes
	VPBROADCASTB X5, Y8         // Y8: 0x0f in every byte

	SHRQ $5, R9          // R9: number of 32-byte blocks
	CMPQ R9, $0
	JEQ  xor_avx2_done

xor_avx2_loop:
	VMOVDQU (DI), Y4   // Y4: current out block
	VMOVDQU (SI), Y0   // Y0: in block
	VPSRLQ  $4, Y0, Y1 // Y1: input shifted right by 4 bits
	VPAND   Y8, Y1, Y1 // Y1: high nibbles
	VPAND   Y8, Y0, Y0 // Y0: low nibbles
	VPSHUFB Y1, Y7, Y3 // Y3: mul high part
	VPSHUFB Y0, Y6, Y2 // Y2: mul low part
	VPXOR   Y2, Y3, Y2 // Y2: result
	VPXOR   Y2, Y4, Y4 // Y4: result xor existing out
	VMOVDQU Y4, (DI)   // store back to out

	ADDQ $32, SI       // advance in
	ADDQ $32, DI       // advance out
	DECQ R9
	JNZ  xor_avx2_loop

xor_avx2_done:
	VZEROUPPER         // clear upper YMM halves before returning
	RET

// func galMulAVX2(low, high, in, out []byte)
//
// 32-bytes-per-iteration variant of the nibble table lookup: the 16-byte
// tables at low and high are duplicated into both 128-bit lanes of Y6/Y7,
// each input byte is split into its low and high nibble, both nibbles are
// looked up with VPSHUFB, and the XOR of the two lookups overwrites out.
//
// Only len(in)/32 whole blocks are processed; len(out) is never read.
// All loads and stores are unaligned (VMOVDQU).
TEXT ·galMulAVX2(SB), 7, $0
	MOVQ  low+0(FP), AX     // AX: &low
	MOVQ  high+24(FP), CX   // CX: &high
	MOVQ  $15, R8           // R8: low nibble mask (0x0f)
	MOVOU (AX), X6          // X6: low table
	MOVOU (CX), X7          // X7: high table
	MOVQ  R8, X5            // X5: mask in byte 0
	MOVQ  in+48(FP), SI     // SI: &in
	MOVQ  out+72(FP), DI    // DI: &out
	MOVQ  in_len+56(FP), R9 // R9: len(in)

	VINSERTI128  $1, X6, Y6, Y6 // Y6: low table in both lanes
	VINSERTI128  $1, X7, Y7, Y7 // Y7: high table in both lanes
	VPBROADCASTB X5, Y8         // Y8: 0x0f in every byte

	SHRQ $5, R9          // R9: number of 32-byte blocks
	CMPQ R9, $0
	JEQ  mul_avx2_done

mul_avx2_loop:
	VMOVDQU (SI), Y0   // Y0: in block
	VPSRLQ  $4, Y0, Y1 // Y1: input shifted right by 4 bits
	VPAND   Y8, Y1, Y1 // Y1: high nibbles
	VPAND   Y8, Y0, Y0 // Y0: low nibbles
	VPSHUFB Y1, Y7, Y3 // Y3: mul high part
	VPSHUFB Y0, Y6, Y2 // Y2: mul low part
	VPXOR   Y2, Y3, Y4 // Y4: result
	VMOVDQU Y4, (DI)   // store to out

	ADDQ $32, SI       // advance in
	ADDQ $32, DI       // advance out
	DECQ R9
	JNZ  mul_avx2_loop

mul_avx2_done:
	VZEROUPPER         // clear upper YMM halves before returning
	RET

// func sSE2XorSlice(in, out []byte)
//
// XORs in into out, 16 bytes at a time, using unaligned loads and stores.
// Only len(in)/16 whole blocks are processed; len(out) is never read.
TEXT ·sSE2XorSlice(SB), 7, $0
	MOVQ  in_len+8(FP), CX // CX: len(in)
	MOVQ  in+0(FP), AX     // AX: &in
	MOVQ  out+24(FP), BX   // BX: &out
	SHRQ  $4, CX           // CX: number of 16-byte blocks
	TESTQ CX, CX
	JZ    xor_sse2_done

xor_sse2_loop:
	MOVOU (BX), X1          // X1: out block
	MOVOU (AX), X0          // X0: in block
	PXOR  X0, X1            // X1: in xor out
	MOVOU X1, (BX)          // store back to out
	ADDQ  $16, AX           // advance in
	ADDQ  $16, BX           // advance out
	DECQ  CX
	JNZ   xor_sse2_loop

xor_sse2_done:
	RET
Implement XL layer - preliminary work. 9 years ago			`//+build !noasm !appengine`

			`// Copyright 2015, Klaus Post, see LICENSE for details.`

			`// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf`
			`// and http://jerasure.org/jerasure/gf-complete/tree/master`

			`// func galMulSSSE3Xor(low, high, in, out []byte)`
			`TEXT ·galMulSSSE3Xor(SB), 7, $0`
			`MOVQ low+0(FP), SI // SI: &low`
			`MOVQ high+24(FP), DX // DX: &high`
			`MOVOU (SI), X6 // X6 low`
			`MOVOU (DX), X7 // X7: high`
			`MOVQ $15, BX // BX: low mask`
			`MOVQ BX, X8`
			`PXOR X5, X5`
			`MOVQ in+48(FP), SI // R11: &in`
			`MOVQ in_len+56(FP), R9 // R9: len(in)`
			`MOVQ out+72(FP), DX // DX: &out`
			`PSHUFB X5, X8 // X8: lomask (unpacked)`
			`SHRQ $4, R9 // len(in) / 16`
Automatically set goroutines based on shardSize (#5346) Update reedsolomon library to enable feature to automatically set number of go-routines based on the input shard size, since shard size is sort of a constant in Minio for objects > 10MiB (default blocksize) klauspost reported around 15-20% improvement in performance numbers on older systems such as AVX and SSE3 ``` name old speed new speed delta Encode10x2x10000-8 5.45GB/s ± 1% 6.22GB/s ± 1% +14.20% (p=0.000 n=9+9) Encode100x20x10000-8 1.44GB/s ± 1% 1.64GB/s ± 1% +13.77% (p=0.000 n=10+10) Encode17x3x1M-8 10.0GB/s ± 5% 12.0GB/s ± 1% +19.88% (p=0.000 n=10+10) Encode10x4x16M-8 7.81GB/s ± 5% 8.56GB/s ± 5% +9.58% (p=0.000 n=10+9) Encode5x2x1M-8 15.3GB/s ± 2% 19.6GB/s ± 2% +28.57% (p=0.000 n=9+10) Encode10x2x1M-8 12.2GB/s ± 5% 15.0GB/s ± 5% +22.45% (p=0.000 n=10+10) Encode10x4x1M-8 7.84GB/s ± 1% 9.03GB/s ± 1% +15.19% (p=0.000 n=9+9) Encode50x20x1M-8 1.73GB/s ± 4% 2.09GB/s ± 4% +20.59% (p=0.000 n=10+9) Encode17x3x16M-8 10.6GB/s ± 1% 11.7GB/s ± 4% +10.12% (p=0.000 n=8+10) ``` 7 years ago			`MOVQ SI, AX`
			`MOVQ DX, BX`
			`ANDQ $15, AX`
			`ANDQ $15, BX`
Implement XL layer - preliminary work. 9 years ago			`CMPQ R9, $0`
			`JEQ done_xor`
Automatically set goroutines based on shardSize (#5346) Update reedsolomon library to enable feature to automatically set number of go-routines based on the input shard size, since shard size is sort of a constant in Minio for objects > 10MiB (default blocksize) klauspost reported around 15-20% improvement in performance numbers on older systems such as AVX and SSE3 ``` name old speed new speed delta Encode10x2x10000-8 5.45GB/s ± 1% 6.22GB/s ± 1% +14.20% (p=0.000 n=9+9) Encode100x20x10000-8 1.44GB/s ± 1% 1.64GB/s ± 1% +13.77% (p=0.000 n=10+10) Encode17x3x1M-8 10.0GB/s ± 5% 12.0GB/s ± 1% +19.88% (p=0.000 n=10+10) Encode10x4x16M-8 7.81GB/s ± 5% 8.56GB/s ± 5% +9.58% (p=0.000 n=10+9) Encode5x2x1M-8 15.3GB/s ± 2% 19.6GB/s ± 2% +28.57% (p=0.000 n=9+10) Encode10x2x1M-8 12.2GB/s ± 5% 15.0GB/s ± 5% +22.45% (p=0.000 n=10+10) Encode10x4x1M-8 7.84GB/s ± 1% 9.03GB/s ± 1% +15.19% (p=0.000 n=9+9) Encode50x20x1M-8 1.73GB/s ± 4% 2.09GB/s ± 4% +20.59% (p=0.000 n=10+9) Encode17x3x16M-8 10.6GB/s ± 1% 11.7GB/s ± 4% +10.12% (p=0.000 n=8+10) ``` 7 years ago			`ORQ AX, BX`
			`CMPQ BX, $0`
			`JNZ loopback_xor`

			`loopback_xor_aligned:`
			`MOVOA (SI), X0 // in[x]`
			`MOVOA (DX), X4 // out[x]`
			`MOVOA X0, X1 // in[x]`
			`MOVOA X6, X2 // low copy`
			`MOVOA X7, X3 // high copy`
			`PSRLQ $4, X1 // X1: high input`
			`PAND X8, X0 // X0: low input`
			`PAND X8, X1 // X0: high input`
			`PSHUFB X0, X2 // X2: mul low part`
			`PSHUFB X1, X3 // X3: mul high part`
			`PXOR X2, X3 // X3: Result`
			`PXOR X4, X3 // X3: Result xor existing out`
			`MOVOA X3, (DX) // Store`
			`ADDQ $16, SI // in+=16`
			`ADDQ $16, DX // out+=16`
			`SUBQ $1, R9`
			`JNZ loopback_xor_aligned`
			`JMP done_xor`
Implement XL layer - preliminary work. 9 years ago
			`loopback_xor:`
			`MOVOU (SI), X0 // in[x]`
			`MOVOU (DX), X4 // out[x]`
			`MOVOU X0, X1 // in[x]`
			`MOVOU X6, X2 // low copy`
			`MOVOU X7, X3 // high copy`
			`PSRLQ $4, X1 // X1: high input`
			`PAND X8, X0 // X0: low input`
			`PAND X8, X1 // X0: high input`
			`PSHUFB X0, X2 // X2: mul low part`
			`PSHUFB X1, X3 // X3: mul high part`
			`PXOR X2, X3 // X3: Result`
			`PXOR X4, X3 // X3: Result xor existing out`
			`MOVOU X3, (DX) // Store`
			`ADDQ $16, SI // in+=16`
			`ADDQ $16, DX // out+=16`
			`SUBQ $1, R9`
			`JNZ loopback_xor`

			`done_xor:`
			`RET`

			`// func galMulSSSE3(low, high, in, out []byte)`
			`TEXT ·galMulSSSE3(SB), 7, $0`
			`MOVQ low+0(FP), SI // SI: &low`
			`MOVQ high+24(FP), DX // DX: &high`
			`MOVOU (SI), X6 // X6 low`
			`MOVOU (DX), X7 // X7: high`
			`MOVQ $15, BX // BX: low mask`
			`MOVQ BX, X8`
			`PXOR X5, X5`
			`MOVQ in+48(FP), SI // R11: &in`
			`MOVQ in_len+56(FP), R9 // R9: len(in)`
			`MOVQ out+72(FP), DX // DX: &out`
			`PSHUFB X5, X8 // X8: lomask (unpacked)`
Automatically set goroutines based on shardSize (#5346) Update reedsolomon library to enable feature to automatically set number of go-routines based on the input shard size, since shard size is sort of a constant in Minio for objects > 10MiB (default blocksize) klauspost reported around 15-20% improvement in performance numbers on older systems such as AVX and SSE3 ``` name old speed new speed delta Encode10x2x10000-8 5.45GB/s ± 1% 6.22GB/s ± 1% +14.20% (p=0.000 n=9+9) Encode100x20x10000-8 1.44GB/s ± 1% 1.64GB/s ± 1% +13.77% (p=0.000 n=10+10) Encode17x3x1M-8 10.0GB/s ± 5% 12.0GB/s ± 1% +19.88% (p=0.000 n=10+10) Encode10x4x16M-8 7.81GB/s ± 5% 8.56GB/s ± 5% +9.58% (p=0.000 n=10+9) Encode5x2x1M-8 15.3GB/s ± 2% 19.6GB/s ± 2% +28.57% (p=0.000 n=9+10) Encode10x2x1M-8 12.2GB/s ± 5% 15.0GB/s ± 5% +22.45% (p=0.000 n=10+10) Encode10x4x1M-8 7.84GB/s ± 1% 9.03GB/s ± 1% +15.19% (p=0.000 n=9+9) Encode50x20x1M-8 1.73GB/s ± 4% 2.09GB/s ± 4% +20.59% (p=0.000 n=10+9) Encode17x3x16M-8 10.6GB/s ± 1% 11.7GB/s ± 4% +10.12% (p=0.000 n=8+10) ``` 7 years ago			`MOVQ SI, AX`
			`MOVQ DX, BX`
Implement XL layer - preliminary work. 9 years ago			`SHRQ $4, R9 // len(in) / 16`
Automatically set goroutines based on shardSize (#5346) Update reedsolomon library to enable feature to automatically set number of go-routines based on the input shard size, since shard size is sort of a constant in Minio for objects > 10MiB (default blocksize) klauspost reported around 15-20% improvement in performance numbers on older systems such as AVX and SSE3 ``` name old speed new speed delta Encode10x2x10000-8 5.45GB/s ± 1% 6.22GB/s ± 1% +14.20% (p=0.000 n=9+9) Encode100x20x10000-8 1.44GB/s ± 1% 1.64GB/s ± 1% +13.77% (p=0.000 n=10+10) Encode17x3x1M-8 10.0GB/s ± 5% 12.0GB/s ± 1% +19.88% (p=0.000 n=10+10) Encode10x4x16M-8 7.81GB/s ± 5% 8.56GB/s ± 5% +9.58% (p=0.000 n=10+9) Encode5x2x1M-8 15.3GB/s ± 2% 19.6GB/s ± 2% +28.57% (p=0.000 n=9+10) Encode10x2x1M-8 12.2GB/s ± 5% 15.0GB/s ± 5% +22.45% (p=0.000 n=10+10) Encode10x4x1M-8 7.84GB/s ± 1% 9.03GB/s ± 1% +15.19% (p=0.000 n=9+9) Encode50x20x1M-8 1.73GB/s ± 4% 2.09GB/s ± 4% +20.59% (p=0.000 n=10+9) Encode17x3x16M-8 10.6GB/s ± 1% 11.7GB/s ± 4% +10.12% (p=0.000 n=8+10) ``` 7 years ago			`ANDQ $15, AX`
			`ANDQ $15, BX`
Implement XL layer - preliminary work. 9 years ago			`CMPQ R9, $0`
			`JEQ done`
Automatically set goroutines based on shardSize (#5346) Update reedsolomon library to enable feature to automatically set number of go-routines based on the input shard size, since shard size is sort of a constant in Minio for objects > 10MiB (default blocksize) klauspost reported around 15-20% improvement in performance numbers on older systems such as AVX and SSE3 ``` name old speed new speed delta Encode10x2x10000-8 5.45GB/s ± 1% 6.22GB/s ± 1% +14.20% (p=0.000 n=9+9) Encode100x20x10000-8 1.44GB/s ± 1% 1.64GB/s ± 1% +13.77% (p=0.000 n=10+10) Encode17x3x1M-8 10.0GB/s ± 5% 12.0GB/s ± 1% +19.88% (p=0.000 n=10+10) Encode10x4x16M-8 7.81GB/s ± 5% 8.56GB/s ± 5% +9.58% (p=0.000 n=10+9) Encode5x2x1M-8 15.3GB/s ± 2% 19.6GB/s ± 2% +28.57% (p=0.000 n=9+10) Encode10x2x1M-8 12.2GB/s ± 5% 15.0GB/s ± 5% +22.45% (p=0.000 n=10+10) Encode10x4x1M-8 7.84GB/s ± 1% 9.03GB/s ± 1% +15.19% (p=0.000 n=9+9) Encode50x20x1M-8 1.73GB/s ± 4% 2.09GB/s ± 4% +20.59% (p=0.000 n=10+9) Encode17x3x16M-8 10.6GB/s ± 1% 11.7GB/s ± 4% +10.12% (p=0.000 n=8+10) ``` 7 years ago			`ORQ AX, BX`
			`CMPQ BX, $0`
			`JNZ loopback`

			`loopback_aligned:`
			`MOVOA (SI), X0 // in[x]`
			`MOVOA X0, X1 // in[x]`
			`MOVOA X6, X2 // low copy`
			`MOVOA X7, X3 // high copy`
			`PSRLQ $4, X1 // X1: high input`
			`PAND X8, X0 // X0: low input`
			`PAND X8, X1 // X0: high input`
			`PSHUFB X0, X2 // X2: mul low part`
			`PSHUFB X1, X3 // X3: mul high part`
			`PXOR X2, X3 // X3: Result`
			`MOVOA X3, (DX) // Store`
			`ADDQ $16, SI // in+=16`
			`ADDQ $16, DX // out+=16`
			`SUBQ $1, R9`
			`JNZ loopback_aligned`
			`JMP done`
Implement XL layer - preliminary work. 9 years ago
			`loopback:`
			`MOVOU (SI), X0 // in[x]`
			`MOVOU X0, X1 // in[x]`
Automatically set goroutines based on shardSize (#5346) Update reedsolomon library to enable feature to automatically set number of go-routines based on the input shard size, since shard size is sort of a constant in Minio for objects > 10MiB (default blocksize) klauspost reported around 15-20% improvement in performance numbers on older systems such as AVX and SSE3 ``` name old speed new speed delta Encode10x2x10000-8 5.45GB/s ± 1% 6.22GB/s ± 1% +14.20% (p=0.000 n=9+9) Encode100x20x10000-8 1.44GB/s ± 1% 1.64GB/s ± 1% +13.77% (p=0.000 n=10+10) Encode17x3x1M-8 10.0GB/s ± 5% 12.0GB/s ± 1% +19.88% (p=0.000 n=10+10) Encode10x4x16M-8 7.81GB/s ± 5% 8.56GB/s ± 5% +9.58% (p=0.000 n=10+9) Encode5x2x1M-8 15.3GB/s ± 2% 19.6GB/s ± 2% +28.57% (p=0.000 n=9+10) Encode10x2x1M-8 12.2GB/s ± 5% 15.0GB/s ± 5% +22.45% (p=0.000 n=10+10) Encode10x4x1M-8 7.84GB/s ± 1% 9.03GB/s ± 1% +15.19% (p=0.000 n=9+9) Encode50x20x1M-8 1.73GB/s ± 4% 2.09GB/s ± 4% +20.59% (p=0.000 n=10+9) Encode17x3x16M-8 10.6GB/s ± 1% 11.7GB/s ± 4% +10.12% (p=0.000 n=8+10) ``` 7 years ago			`MOVOA X6, X2 // low copy`
			`MOVOA X7, X3 // high copy`
Implement XL layer - preliminary work. 9 years ago			`PSRLQ $4, X1 // X1: high input`
			`PAND X8, X0 // X0: low input`
			`PAND X8, X1 // X0: high input`
			`PSHUFB X0, X2 // X2: mul low part`
			`PSHUFB X1, X3 // X3: mul high part`
			`PXOR X2, X3 // X3: Result`
			`MOVOU X3, (DX) // Store`
			`ADDQ $16, SI // in+=16`
			`ADDQ $16, DX // out+=16`
			`SUBQ $1, R9`
			`JNZ loopback`

			`done:`
			`RET`

			`// func galMulAVX2Xor(low, high, in, out []byte)`
			`TEXT ·galMulAVX2Xor(SB), 7, $0`
			`MOVQ low+0(FP), SI // SI: &low`
			`MOVQ high+24(FP), DX // DX: &high`
			`MOVQ $15, BX // BX: low mask`
			`MOVQ BX, X5`
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`MOVOU (SI), X6 // X6: low`
Implement XL layer - preliminary work. 9 years ago			`MOVOU (DX), X7 // X7: high`
			`MOVQ in_len+56(FP), R9 // R9: len(in)`

Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`VINSERTI128 $1, X6, Y6, Y6 // low`
			`VINSERTI128 $1, X7, Y7, Y7 // high`
			`VPBROADCASTB X5, Y8 // Y8: lomask (unpacked)`
Implement XL layer - preliminary work. 9 years ago
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`SHRQ $5, R9 // len(in) / 32`
Implement XL layer - preliminary work. 9 years ago			`MOVQ out+72(FP), DX // DX: &out`
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`MOVQ in+48(FP), SI // SI: &in`
Implement XL layer - preliminary work. 9 years ago			`TESTQ R9, R9`
			`JZ done_xor_avx2`

			`loopback_xor_avx2:`
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`VMOVDQU (SI), Y0`
			`VMOVDQU (DX), Y4`
			`VPSRLQ $4, Y0, Y1 // Y1: high input`
			`VPAND Y8, Y0, Y0 // Y0: low input`
			`VPAND Y8, Y1, Y1 // Y1: high input`
			`VPSHUFB Y0, Y6, Y2 // Y2: mul low part`
			`VPSHUFB Y1, Y7, Y3 // Y3: mul high part`
			`VPXOR Y3, Y2, Y3 // Y3: Result`
			`VPXOR Y4, Y3, Y4 // Y4: Result`
			`VMOVDQU Y4, (DX)`
Implement XL layer - preliminary work. 9 years ago
			`ADDQ $32, SI // in+=32`
			`ADDQ $32, DX // out+=32`
			`SUBQ $1, R9`
			`JNZ loopback_xor_avx2`

			`done_xor_avx2:`
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`VZEROUPPER`
Implement XL layer - preliminary work. 9 years ago			`RET`

			`// func galMulAVX2(low, high, in, out []byte)`
			`TEXT ·galMulAVX2(SB), 7, $0`
			`MOVQ low+0(FP), SI // SI: &low`
			`MOVQ high+24(FP), DX // DX: &high`
			`MOVQ $15, BX // BX: low mask`
			`MOVQ BX, X5`
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`MOVOU (SI), X6 // X6: low`
Implement XL layer - preliminary work. 9 years ago			`MOVOU (DX), X7 // X7: high`
			`MOVQ in_len+56(FP), R9 // R9: len(in)`

Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`VINSERTI128 $1, X6, Y6, Y6 // low`
			`VINSERTI128 $1, X7, Y7, Y7 // high`
			`VPBROADCASTB X5, Y8 // Y8: lomask (unpacked)`
Implement XL layer - preliminary work. 9 years ago
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`SHRQ $5, R9 // len(in) / 32`
Implement XL layer - preliminary work. 9 years ago			`MOVQ out+72(FP), DX // DX: &out`
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`MOVQ in+48(FP), SI // SI: &in`
Implement XL layer - preliminary work. 9 years ago			`TESTQ R9, R9`
			`JZ done_avx2`

			`loopback_avx2:`
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`VMOVDQU (SI), Y0`
			`VPSRLQ $4, Y0, Y1 // Y1: high input`
			`VPAND Y8, Y0, Y0 // Y0: low input`
			`VPAND Y8, Y1, Y1 // Y1: high input`
			`VPSHUFB Y0, Y6, Y2 // Y2: mul low part`
			`VPSHUFB Y1, Y7, Y3 // Y3: mul high part`
			`VPXOR Y3, Y2, Y4 // Y4: Result`
			`VMOVDQU Y4, (DX)`
Implement XL layer - preliminary work. 9 years ago
			`ADDQ $32, SI // in+=32`
			`ADDQ $32, DX // out+=32`
			`SUBQ $1, R9`
			`JNZ loopback_avx2`

			`done_avx2:`
Updated version of klauspost/reedsolomon using proper AVX2 instructions as well a providing support for Cauchy matrices. (#5215) 7 years ago			`VZEROUPPER`
Implement XL layer - preliminary work. 9 years ago			`RET`
Updated version of klauspost/reedsolomon with NEON support for ARM (#4865) 8 years ago
			`// func sSE2XorSlice(in, out []byte)`
			`TEXT ·sSE2XorSlice(SB), 7, $0`
			`MOVQ in+0(FP), SI // SI: &in`
			`MOVQ in_len+8(FP), R9 // R9: len(in)`
			`MOVQ out+24(FP), DX // DX: &out`
			`SHRQ $4, R9 // len(in) / 16`
			`CMPQ R9, $0`
			`JEQ done_xor_sse2`

			`loopback_xor_sse2:`
			`MOVOU (SI), X0 // in[x]`
			`MOVOU (DX), X1 // out[x]`
			`PXOR X0, X1`
			`MOVOU X1, (DX)`
			`ADDQ $16, SI // in+=16`
			`ADDQ $16, DX // out+=16`
			`SUBQ $1, R9`
			`JNZ loopback_xor_sse2`

			`done_xor_sse2:`
			`RET`