From d484157d6706a4955dad77347e261cc4d63f57bb Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Wed, 29 Jun 2016 02:06:35 -0700 Subject: [PATCH] XL/bitrot: Migrate to new blake2b-simd SIMD optimized implementation. (#2031) Thanks for Frank Wessels for all the heavy lifting work. Comparative benchmarks are as below. ``` benchmark old ns/op new ns/op delta BenchmarkHash64-4 742 411 -44.61% BenchmarkHash128-4 681 346 -49.19% BenchmarkWrite1K-4 4239 1497 -64.69% BenchmarkWrite8K-4 33633 11514 -65.77% BenchmarkWrite32K-4 134091 45947 -65.73% BenchmarkWrite128K-4 537976 183643 -65.86% benchmark old MB/s new MB/s speedup BenchmarkHash64-4 86.18 155.51 1.80x BenchmarkHash128-4 187.96 369.10 1.96x BenchmarkWrite1K-4 241.55 683.87 2.83x BenchmarkWrite8K-4 3897.06 11383.41 2.92x BenchmarkWrite32K-4 977.48 2852.63 2.92x BenchmarkWrite128K-4 243.64 713.73 2.93x ``` Fixes #2030 --- erasure-utils.go | 2 +- vendor/github.com/dchest/blake2b/README | 23 - vendor/github.com/minio/blake2b-simd/LICENSE | 202 +++++ .../github.com/minio/blake2b-simd/README.md | 20 + .../blake2b => minio/blake2b-simd}/blake2b.go | 18 +- .../minio/blake2b-simd/compress_amd64.go | 62 ++ .../minio/blake2b-simd/compress_amd64.s | 768 ++++++++++++++++++ .../blake2b-simd/compress_generic.go} | 4 +- .../minio/blake2b-simd/compress_noasm.go | 14 + vendor/github.com/minio/blake2b-simd/cpuid.go | 37 + .../github.com/minio/blake2b-simd/cpuid_386.s | 22 + .../minio/blake2b-simd/cpuid_amd64.s | 22 + vendor/vendor.json | 10 +- 13 files changed, 1164 insertions(+), 40 deletions(-) delete mode 100644 vendor/github.com/dchest/blake2b/README create mode 100644 vendor/github.com/minio/blake2b-simd/LICENSE create mode 100644 vendor/github.com/minio/blake2b-simd/README.md rename vendor/github.com/{dchest/blake2b => minio/blake2b-simd}/blake2b.go (96%) create mode 100644 vendor/github.com/minio/blake2b-simd/compress_amd64.go create mode 100644 vendor/github.com/minio/blake2b-simd/compress_amd64.s rename vendor/github.com/{dchest/blake2b/block.go => minio/blake2b-simd/compress_generic.go} (99%) create mode 100644 vendor/github.com/minio/blake2b-simd/compress_noasm.go create mode 100644 vendor/github.com/minio/blake2b-simd/cpuid.go create mode 100644 vendor/github.com/minio/blake2b-simd/cpuid_386.s create mode 100644 vendor/github.com/minio/blake2b-simd/cpuid_amd64.s diff --git a/erasure-utils.go b/erasure-utils.go index 3882729d4..c72681e17 100644 --- a/erasure-utils.go +++ b/erasure-utils.go @@ -22,8 +22,8 @@ import ( "hash" "io" - "github.com/dchest/blake2b" "github.com/klauspost/reedsolomon" + "github.com/minio/blake2b-simd" ) // newHashWriters - inititialize a slice of hashes for the disk count. diff --git a/vendor/github.com/dchest/blake2b/README b/vendor/github.com/dchest/blake2b/README deleted file mode 100644 index 09f014361..000000000 --- a/vendor/github.com/dchest/blake2b/README +++ /dev/null @@ -1,23 +0,0 @@ -Go implementation of BLAKE2b collision-resistant cryptographic hash function -created by Jean-Philippe Aumasson, Samuel Neves, Zooko Wilcox-O'Hearn, and -Christian Winnerlein (https://blake2.net). - -INSTALLATION - - $ go get github.com/dchest/blake2b - - -DOCUMENTATION - - See http://godoc.org/github.com/dchest/blake2b - - -PUBLIC DOMAIN DEDICATION - -Written in 2012 by Dmitry Chestnykh. - -To the extent possible under law, the author have dedicated all copyright -and related and neighboring rights to this software to the public domain -worldwide. This software is distributed without any warranty. -http://creativecommons.org/publicdomain/zero/1.0/ - diff --git a/vendor/github.com/minio/blake2b-simd/LICENSE b/vendor/github.com/minio/blake2b-simd/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/vendor/github.com/minio/blake2b-simd/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/github.com/minio/blake2b-simd/README.md b/vendor/github.com/minio/blake2b-simd/README.md new file mode 100644 index 000000000..86993a45c --- /dev/null +++ b/vendor/github.com/minio/blake2b-simd/README.md @@ -0,0 +1,20 @@ +BLAKE2b-SIMD +============ + +Pure Go implementation of BLAKE2b using SIMD optimizations. + +Introduction +------------ + +This package is based on the pure go [BLAKE2b](https://github.com/dchest/blake2b) implementation of Dmitry Chestnykh and merges it with the (`cgo` dependent) SSE optimized [BLAKE2](https://github.com/codahale/blake2) implementation (which in turn is based on [official implementation](https://github.com/BLAKE2/BLAKE2). It does so by using [Go's Assembler](https://golang.org/doc/asm) for amd64 architectures with a fallback for other architectures. + +It gives roughly a 3x performance improvement over the non-optimized go version. + +Benchmarks +---------- + +| Dura | 1 GB | +| ------------- |:-----:| +| blake2b-SIMD | 1.59s | +| blake2b | 4.66s | + diff --git a/vendor/github.com/dchest/blake2b/blake2b.go b/vendor/github.com/minio/blake2b-simd/blake2b.go similarity index 96% rename from vendor/github.com/dchest/blake2b/blake2b.go rename to vendor/github.com/minio/blake2b-simd/blake2b.go index f3eb38872..538466a1a 100644 --- a/vendor/github.com/dchest/blake2b/blake2b.go +++ b/vendor/github.com/minio/blake2b-simd/blake2b.go @@ -143,6 +143,7 @@ func (d *digest) initialize(c *Config) { p[2] = 1 p[3] = 1 } + // Initialize. d.size = c.Size for i := 0; i < 8; i++ { @@ -151,6 +152,7 @@ func (d *digest) initialize(c *Config) { if c.Tree != nil && c.Tree.IsLastNode { d.isLastNode = true } + // Process key. if c.Key != nil { copy(d.paddedKey[:], c.Key) @@ -213,7 +215,7 @@ func (d *digest) Write(p []byte) (nn int, err error) { // Process buffer. copy(d.x[d.nx:], p[:left]) p = p[left:] - blocks(d, d.x[:]) + compress(d, d.x[:]) d.nx = 0 } // Process full blocks except for the last one. @@ -222,7 +224,7 @@ func (d *digest) Write(p []byte) (nn int, err error) { if n == len(p) { n -= BlockSize } - blocks(d, p[:n]) + compress(d, p[:n]) p = p[n:] } // Fill buffer. @@ -231,11 +233,11 @@ func (d *digest) Write(p []byte) (nn int, err error) { } // Sum returns the calculated checksum. -func (d0 *digest) Sum(in []byte) []byte { - // Make a copy of d0 so that caller can keep writing and summing. - d := *d0 - hash := d.checkSum() - return append(in, hash[:d.size]...) +func (d *digest) Sum(in []byte) []byte { + // Make a copy of d so that caller can keep writing and summing. + d0 := *d + hash := d0.checkSum() + return append(in, hash[:d0.size]...) } func (d *digest) checkSum() [Size]byte { @@ -262,7 +264,7 @@ func (d *digest) checkSum() [Size]byte { d.f[1] = 0xffffffffffffffff } // Compress last block. - blocks(d, d.x[:]) + compress(d, d.x[:]) var out [Size]byte j := 0 diff --git a/vendor/github.com/minio/blake2b-simd/compress_amd64.go b/vendor/github.com/minio/blake2b-simd/compress_amd64.go new file mode 100644 index 000000000..0a5299e01 --- /dev/null +++ b/vendor/github.com/minio/blake2b-simd/compress_amd64.go @@ -0,0 +1,62 @@ +//+build !noasm +//+build !appengine + +/* + * Minio Cloud Storage, (C) 2016 Minio, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package blake2b + +//go:noescape +func blockAVX(p []uint8, in, iv, t, f, shffle, out []uint64) + +func compressAVX(d *digest, p []uint8) { + h0, h1, h2, h3, h4, h5, h6, h7 := d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] + + in := make([]uint64, 8, 8) + out := make([]uint64, 8, 8) + + shffle := make([]uint64, 2, 2) + // vector for PSHUFB instruction + shffle[0] = 0x0201000706050403 + shffle[1] = 0x0a09080f0e0d0c0b + + for len(p) >= BlockSize { + // Increment counter. + d.t[0] += BlockSize + if d.t[0] < BlockSize { + d.t[1]++ + } + + in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = h0, h1, h2, h3, h4, h5, h6, h7 + + blockAVX(p, in, iv[:], d.t[:], d.f[:], shffle, out) + + h0, h1, h2, h3, h4, h5, h6, h7 = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7] + + p = p[BlockSize:] + } + + d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = h0, h1, h2, h3, h4, h5, h6, h7 +} + +func compress(d *digest, p []uint8) { + // Verifies if AVX is available, use optimized code path. + if avx { + compressAVX(d, p) + return + } // else { fallback to generic approach. + compressGeneric(d, p) +} diff --git a/vendor/github.com/minio/blake2b-simd/compress_amd64.s b/vendor/github.com/minio/blake2b-simd/compress_amd64.s new file mode 100644 index 000000000..beb142952 --- /dev/null +++ b/vendor/github.com/minio/blake2b-simd/compress_amd64.s @@ -0,0 +1,768 @@ +//+build !noasm !appengine + +// +// Minio Cloud Storage, (C) 2016 Minio, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// +// Based on SSE implementation from https://github.com/BLAKE2/BLAKE2/blob/master/sse/blake2b.c +// +// Use github.com/fwessels/asm2plan9s on this file to assemble instructions to their Plan9 equivalent +// +// Assembly code below essentially follows the ROUND macro (see blake2b-round.h) which is defined as: +// #define ROUND(r) \ +// LOAD_MSG_ ##r ##_1(b0, b1); \ +// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ +// LOAD_MSG_ ##r ##_2(b0, b1); \ +// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ +// DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ +// LOAD_MSG_ ##r ##_3(b0, b1); \ +// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ +// LOAD_MSG_ ##r ##_4(b0, b1); \ +// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ +// UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); +// +// as well as the go equivalent in https://github.com/dchest/blake2b/blob/master/block.go +// +// As in the macro, G1/G2 in the 1st and 2nd half are identical (so literal copy of assembly) +// +// Rounds are also the same, except for the loading of the message (and rounds 1 & 11 and +// rounds 2 & 12 are identical) +// + +#define G1 \ + \ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); + BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] */ + BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */ + BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */ + BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ + BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ + BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xf6; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */ + BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xff; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */ + BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */ + BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */ + BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ + BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ + BYTE $0xc4; BYTE $0xc2; BYTE $0x69; BYTE $0x00; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */ + BYTE $0xc4; BYTE $0xc2; BYTE $0x61; BYTE $0x00; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */ + +#define G2 \ + \ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); + BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */ + BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */ + BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */ + BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */ + BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */ + BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */ + BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */ + BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */ + BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */ + BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */ + BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */ + BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */ + BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */ + BYTE $0xc5; BYTE $0x69; BYTE $0xd4; BYTE $0xfa \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */ + BYTE $0xc5; BYTE $0xe9; BYTE $0x73; BYTE $0xd2; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0xef; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */ + BYTE $0xc5; BYTE $0x61; BYTE $0xd4; BYTE $0xfb \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */ + BYTE $0xc5; BYTE $0xe1; BYTE $0x73; BYTE $0xd3; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0xef; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */ + +#define DIAGONALIZE \ + \ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + MOVOU X6, X13 \ /* t0 = row4l;\ */ + MOVOU X2, X14 \ /* t1 = row2l;\ */ + MOVOU X4, X6 \ /* row4l = row3l;\ */ + MOVOU X5, X4 \ /* row3l = row3h;\ */ + MOVOU X6, X5 \ /* row3h = row4l;\ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* _mm_unpacklo_epi64(t0, t0) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */ + BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */ + BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */ + +#define UNDIAGONALIZE \ + \ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + MOVOU X4, X13 \ /* t0 = row3l;\ */ + MOVOU X5, X4 \ /* row3l = row3h;\ */ + MOVOU X13, X5 \ /* row3h = t0;\ */ + MOVOU X2, X13 \ /* t0 = row2l;\ */ + MOVOU X6, X14 \ /* t1 = row4l;\ */ + BYTE $0xc5; BYTE $0x69; BYTE $0x6c; BYTE $0xfa \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */ + BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */ + BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x49; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */ + BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */ + +#define LOAD_SHUFFLE \ + \ // Load shuffle value + MOVQ shffle+120(FP), SI \ // SI: &shuffle + MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a + + +// func blockAVX(p []uint8, in, iv, t, f, shffle, out []uint64) +TEXT ·blockAVX(SB), 7, $0 + // REGISTER USE + // X0 - X7: v0 - v15 + // X8 - X11: m[0] - m[7] + // X12: shuffle value + // X13 - X15: temp registers + + // Load digest + MOVQ in+24(FP), SI // SI: &in + MOVOU 0(SI), X0 // X0 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ + MOVOU 16(SI), X1 // X1 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ + MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ + MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ + + // Load initialization vector + MOVQ iv+48(FP), DX // DX: &iv + MOVOU 0(DX), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */ + MOVOU 16(DX), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */ + MOVQ t+72(FP), SI // SI: &t + MOVOU 32(DX), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */ + MOVOU 0(SI), X7 // X7 = t[0]+t[1] /* LOAD( &S->t[0] ) */ + PXOR X7, X6 // X6 = X6 ^ X7 /* row4l = _mm_xor_si128( , ); */ + MOVQ t+96(FP), SI // SI: &f + MOVOU 48(DX), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */ + MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */ + PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */ + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+m[1] + MOVOU 16(DX), X13 // X13 = m[2]+m[3] + MOVOU 32(DX), X14 // X14 = m[4]+m[5] + MOVOU 48(DX), X15 // X15 = m[6]+m[7] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 64(DX), X12 // X12 = m[8]+ m[9] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 2 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 112(DX), X12 // X12 = m[14]+m[15] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */ + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 48(DX), X15 // X15 = m[6]+ m[7] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */ + BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc; BYTE $0x08// VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4; BYTE $0x08 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */ + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 3 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 32(DX), X12 // X12 = m[4]+ m[5] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xc5; BYTE $0x08// VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 64(DX), X15 // X15 = m[8]+ m[9] + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X14 // X14 = m[4]+ m[5] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */ + BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdc; BYTE $0x08// VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 4 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 112(DX), X14 // X14 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X15 // X15 = m[8]+ m[9] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 5 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xd4; BYTE $0x08// VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 6 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 64(DX), X15 // X15 = m[8]+ m[9] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */ + MOVOU 80(DX), X12 // X12 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */ + MOVOU 64(DX), X12 // X12 = m[8]+ m[9] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 7 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */ + MOVOU 80(DX), X12 // X12 = m[10]+m[11] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */ + BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xde; BYTE $0x08// VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */ + BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xce; BYTE $0x08// VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */ + MOVOU 16(DX), X14 // X14 = m[2]+ m[3] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 8 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd6; BYTE $0x08// VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X14 // X14 = m[6]+ m[7] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 9 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */ + BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xce; BYTE $0x08// VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */ + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */ + BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdd; BYTE $0x08// VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 16(DX), X13 // X13 = m[2]+ m[3] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */ + BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xcc; BYTE $0x08// VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */ + MOVOU 32(DX), X12 // X12 = m[4]+ m[5] + MOVOU 48(DX), X15 // X15 = m[6]+ m[7] + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 0 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 80(DX), X15 // X15 = m[10]+m[11] + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */ + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 32(DX), X14 // X14 = m[4]+ m[5] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 64(DX), X13 // X13 = m[8]+ m[9] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */ + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd5; BYTE $0x08// VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 1 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+m[1] + MOVOU 16(DX), X13 // X13 = m[2]+m[3] + MOVOU 32(DX), X14 // X14 = m[4]+m[5] + MOVOU 48(DX), X15 // X15 = m[6]+m[7] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 64(DX), X12 // X12 = m[8]+ m[9] + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + MOVOU 112(DX), X15 // X15 = m[14]+m[15] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + /////////////////////////////////////////////////////////////////////////// + // R O U N D 1 2 + /////////////////////////////////////////////////////////////////////////// + + // LOAD_MSG_ ##r ##_1(b0, b1); + // LOAD_MSG_ ##r ##_2(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 112(DX), X12 // X12 = m[14]+m[15] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 64(DX), X14 // X14 = m[8]+ m[9] + MOVOU 96(DX), X15 // X15 = m[12]+m[13] + BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */ + MOVOU 80(DX), X13 // X13 = m[10]+m[11] + MOVOU 48(DX), X15 // X15 = m[6]+ m[7] + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */ + BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc; BYTE $0x08// VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */ + + LOAD_SHUFFLE + + G1 + G2 + + DIAGONALIZE + + // LOAD_MSG_ ##r ##_3(b0, b1); + // LOAD_MSG_ ##r ##_4(b0, b1); + // (X12 used as additional temp register) + MOVQ message+0(FP), DX // DX: &p (message) + MOVOU 0(DX), X12 // X12 = m[0]+ m[1] + MOVOU 32(DX), X13 // X13 = m[4]+ m[5] + MOVOU 80(DX), X14 // X14 = m[10]+m[11] + BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4; BYTE $0x08 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */ + MOVOU 16(DX), X12 // X12 = m[2]+ m[3] + MOVOU 48(DX), X13 // X13 = m[6]+ m[7] + MOVOU 96(DX), X14 // X14 = m[12]+m[13] + BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */ + BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */ + + LOAD_SHUFFLE + + G1 + G2 + + UNDIAGONALIZE + + // Reload digest + MOVQ in+24(FP), SI // SI: &in + MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */ + MOVOU 16(SI), X13 // X13 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */ + MOVOU 32(SI), X14 // X14 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */ + MOVOU 48(SI), X15 // X15 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */ + + // Final computations and prepare for storing + PXOR X4, X0 // X0 = X0 ^ X4 /* row1l = _mm_xor_si128( row3l, row1l ); */ + PXOR X5, X1 // X1 = X1 ^ X5 /* row1h = _mm_xor_si128( row3h, row1h ); */ + PXOR X12, X0 // X0 = X0 ^ X12 /* STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); */ + PXOR X13, X1 // X1 = X1 ^ X13 /* STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); */ + PXOR X6, X2 // X2 = X2 ^ X6 /* row2l = _mm_xor_si128( row4l, row2l ); */ + PXOR X7, X3 // X3 = X3 ^ X7 /* row2h = _mm_xor_si128( row4h, row2h ); */ + PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */ + PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */ + + // Store digest + MOVQ out+144(FP), DX // DX: &out + MOVOU X0, 0(DX) // out[0]+out[1] = X0 + MOVOU X1, 16(DX) // out[2]+out[3] = X1 + MOVOU X2, 32(DX) // out[4]+out[5] = X2 + MOVOU X3, 48(DX) // out[6]+out[7] = X3 + + RET + diff --git a/vendor/github.com/dchest/blake2b/block.go b/vendor/github.com/minio/blake2b-simd/compress_generic.go similarity index 99% rename from vendor/github.com/dchest/blake2b/block.go rename to vendor/github.com/minio/blake2b-simd/compress_generic.go index 2f04bb78b..62d81aaab 100644 --- a/vendor/github.com/dchest/blake2b/block.go +++ b/vendor/github.com/minio/blake2b-simd/compress_generic.go @@ -5,11 +5,9 @@ // worldwide. This software is distributed without any warranty. // http://creativecommons.org/publicdomain/zero/1.0/ -// BLAKE2b compression of message blocks. - package blake2b -func blocks(d *digest, p []uint8) { +func compressGeneric(d *digest, p []uint8) { h0, h1, h2, h3, h4, h5, h6, h7 := d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] for len(p) >= BlockSize { diff --git a/vendor/github.com/minio/blake2b-simd/compress_noasm.go b/vendor/github.com/minio/blake2b-simd/compress_noasm.go new file mode 100644 index 000000000..7c198dd9f --- /dev/null +++ b/vendor/github.com/minio/blake2b-simd/compress_noasm.go @@ -0,0 +1,14 @@ +//+build !amd64 noasm appengine + +// Written in 2012 by Dmitry Chestnykh. +// +// To the extent possible under law, the author have dedicated all copyright +// and related and neighboring rights to this software to the public domain +// worldwide. This software is distributed without any warranty. +// http://creativecommons.org/publicdomain/zero/1.0/ + +package blake2b + +func compress(d *digest, p []uint8) { + compressGeneric(d, p) +} diff --git a/vendor/github.com/minio/blake2b-simd/cpuid.go b/vendor/github.com/minio/blake2b-simd/cpuid.go new file mode 100644 index 000000000..696e064fa --- /dev/null +++ b/vendor/github.com/minio/blake2b-simd/cpuid.go @@ -0,0 +1,37 @@ +// +build 386,!gccgo amd64,!gccgo + +// Copyright 2016 Frank Wessels +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package blake2b + +func cpuid(op uint32) (eax, ebx, ecx, edx uint32) +func xgetbv(index uint32) (eax, edx uint32) + +// True when SIMD instructions are available. +var avx = haveAVX() + +// haveSSE returns true if we have streaming SIMD instructions. +func haveAVX() bool { + _, _, c, _ := cpuid(1) + + // Check XGETBV, OXSAVE and AVX bits + if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 { + // Check for OS support + eax, _ := xgetbv(0) + return (eax & 0x6) == 0x6 + } + return false +} diff --git a/vendor/github.com/minio/blake2b-simd/cpuid_386.s b/vendor/github.com/minio/blake2b-simd/cpuid_386.s new file mode 100644 index 000000000..9ae606e81 --- /dev/null +++ b/vendor/github.com/minio/blake2b-simd/cpuid_386.s @@ -0,0 +1,22 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// +build 386,!gccgo + +// func cpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·cpuid(SB), 7, $0 + XORL CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+4(FP) + MOVL BX, ebx+8(FP) + MOVL CX, ecx+12(FP) + MOVL DX, edx+16(FP) + RET + +// func xgetbv(index uint32) (eax, edx uint32) +TEXT ·xgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+4(FP) + MOVL DX, edx+8(FP) + RET diff --git a/vendor/github.com/minio/blake2b-simd/cpuid_amd64.s b/vendor/github.com/minio/blake2b-simd/cpuid_amd64.s new file mode 100644 index 000000000..27aad2a10 --- /dev/null +++ b/vendor/github.com/minio/blake2b-simd/cpuid_amd64.s @@ -0,0 +1,22 @@ +// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. + +// +build amd64,!gccgo + +// func cpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·cpuid(SB), 7, $0 + XORQ CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func xgetbv(index uint32) (eax, edx uint32) +TEXT ·xgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+8(FP) + MOVL DX, edx+12(FP) + RET diff --git a/vendor/vendor.json b/vendor/vendor.json index e139499e5..d88f1e587 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -12,11 +12,6 @@ "revision": "c1f48d5ce4f292dfb775ef52aaedd15be323510d", "revisionTime": "2016-05-20T13:10:51+03:00" }, - { - "path": "github.com/dchest/blake2b", - "revision": "3c8c640cd7bea3ca78209d812b5854442ab92fed", - "revisionTime": "2015-10-22T12:35:02+02:00" - }, { "path": "github.com/dgrijalva/jwt-go", "revision": "afef698c326bfd906b11659432544e5aae441d44", @@ -87,6 +82,11 @@ "revision": "7fcbc72f853b92b5720db4a6b8482be612daef24", "revisionTime": "2015-08-14T09:26:29+09:00" }, + { + "path": "github.com/minio/blake2b-simd", + "revision": "0b3e695ecc77a334fafe30ee36de504c41ec4d6a", + "revisionTime": "2016-06-28T02:55:56-07:00" + }, { "path": "github.com/minio/cli", "revision": "c4a07c7b68db77ccd119183fb1d01dd5972434ab",