XL/bitrot: Migrate to new blake2b-simd SIMD optimized implementation. (#2031)
Thanks to Frank Wessels <fwessels@xs4all.nl> for all the heavy lifting work. Comparative benchmarks are below. ``` benchmark old ns/op new ns/op delta BenchmarkHash64-4 742 411 -44.61% BenchmarkHash128-4 681 346 -49.19% BenchmarkWrite1K-4 4239 1497 -64.69% BenchmarkWrite8K-4 33633 11514 -65.77% BenchmarkWrite32K-4 134091 45947 -65.73% BenchmarkWrite128K-4 537976 183643 -65.86% benchmark old MB/s new MB/s speedup BenchmarkHash64-4 86.18 155.51 1.80x BenchmarkHash128-4 187.96 369.10 1.96x BenchmarkWrite1K-4 241.55 683.87 2.83x BenchmarkWrite8K-4 3897.06 11383.41 2.92x BenchmarkWrite32K-4 977.48 2852.63 2.92x BenchmarkWrite128K-4 243.64 713.73 2.93x ``` Fixes #2030master
parent
796fe165c7
commit
d484157d67
@ -1,23 +0,0 @@ |
||||
Go implementation of BLAKE2b collision-resistant cryptographic hash function |
||||
created by Jean-Philippe Aumasson, Samuel Neves, Zooko Wilcox-O'Hearn, and |
||||
Christian Winnerlein (https://blake2.net). |
||||
|
||||
INSTALLATION |
||||
|
||||
$ go get github.com/dchest/blake2b |
||||
|
||||
|
||||
DOCUMENTATION |
||||
|
||||
See http://godoc.org/github.com/dchest/blake2b |
||||
|
||||
|
||||
PUBLIC DOMAIN DEDICATION |
||||
|
||||
Written in 2012 by Dmitry Chestnykh. |
||||
|
||||
To the extent possible under law, the author have dedicated all copyright |
||||
and related and neighboring rights to this software to the public domain |
||||
worldwide. This software is distributed without any warranty. |
||||
http://creativecommons.org/publicdomain/zero/1.0/ |
||||
|
@ -0,0 +1,202 @@ |
||||
|
||||
Apache License |
||||
Version 2.0, January 2004 |
||||
http://www.apache.org/licenses/ |
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION |
||||
|
||||
1. Definitions. |
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction, |
||||
and distribution as defined by Sections 1 through 9 of this document. |
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by |
||||
the copyright owner that is granting the License. |
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all |
||||
other entities that control, are controlled by, or are under common |
||||
control with that entity. For the purposes of this definition, |
||||
"control" means (i) the power, direct or indirect, to cause the |
||||
direction or management of such entity, whether by contract or |
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the |
||||
outstanding shares, or (iii) beneficial ownership of such entity. |
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity |
||||
exercising permissions granted by this License. |
||||
|
||||
"Source" form shall mean the preferred form for making modifications, |
||||
including but not limited to software source code, documentation |
||||
source, and configuration files. |
||||
|
||||
"Object" form shall mean any form resulting from mechanical |
||||
transformation or translation of a Source form, including but |
||||
not limited to compiled object code, generated documentation, |
||||
and conversions to other media types. |
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or |
||||
Object form, made available under the License, as indicated by a |
||||
copyright notice that is included in or attached to the work |
||||
(an example is provided in the Appendix below). |
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object |
||||
form, that is based on (or derived from) the Work and for which the |
||||
editorial revisions, annotations, elaborations, or other modifications |
||||
represent, as a whole, an original work of authorship. For the purposes |
||||
of this License, Derivative Works shall not include works that remain |
||||
separable from, or merely link (or bind by name) to the interfaces of, |
||||
the Work and Derivative Works thereof. |
||||
|
||||
"Contribution" shall mean any work of authorship, including |
||||
the original version of the Work and any modifications or additions |
||||
to that Work or Derivative Works thereof, that is intentionally |
||||
submitted to Licensor for inclusion in the Work by the copyright owner |
||||
or by an individual or Legal Entity authorized to submit on behalf of |
||||
the copyright owner. For the purposes of this definition, "submitted" |
||||
means any form of electronic, verbal, or written communication sent |
||||
to the Licensor or its representatives, including but not limited to |
||||
communication on electronic mailing lists, source code control systems, |
||||
and issue tracking systems that are managed by, or on behalf of, the |
||||
Licensor for the purpose of discussing and improving the Work, but |
||||
excluding communication that is conspicuously marked or otherwise |
||||
designated in writing by the copyright owner as "Not a Contribution." |
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity |
||||
on behalf of whom a Contribution has been received by Licensor and |
||||
subsequently incorporated within the Work. |
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
copyright license to reproduce, prepare Derivative Works of, |
||||
publicly display, publicly perform, sublicense, and distribute the |
||||
Work and such Derivative Works in Source or Object form. |
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
(except as stated in this section) patent license to make, have made, |
||||
use, offer to sell, sell, import, and otherwise transfer the Work, |
||||
where such license applies only to those patent claims licensable |
||||
by such Contributor that are necessarily infringed by their |
||||
Contribution(s) alone or by combination of their Contribution(s) |
||||
with the Work to which such Contribution(s) was submitted. If You |
||||
institute patent litigation against any entity (including a |
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work |
||||
or a Contribution incorporated within the Work constitutes direct |
||||
or contributory patent infringement, then any patent licenses |
||||
granted to You under this License for that Work shall terminate |
||||
as of the date such litigation is filed. |
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the |
||||
Work or Derivative Works thereof in any medium, with or without |
||||
modifications, and in Source or Object form, provided that You |
||||
meet the following conditions: |
||||
|
||||
(a) You must give any other recipients of the Work or |
||||
Derivative Works a copy of this License; and |
||||
|
||||
(b) You must cause any modified files to carry prominent notices |
||||
stating that You changed the files; and |
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works |
||||
that You distribute, all copyright, patent, trademark, and |
||||
attribution notices from the Source form of the Work, |
||||
excluding those notices that do not pertain to any part of |
||||
the Derivative Works; and |
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its |
||||
distribution, then any Derivative Works that You distribute must |
||||
include a readable copy of the attribution notices contained |
||||
within such NOTICE file, excluding those notices that do not |
||||
pertain to any part of the Derivative Works, in at least one |
||||
of the following places: within a NOTICE text file distributed |
||||
as part of the Derivative Works; within the Source form or |
||||
documentation, if provided along with the Derivative Works; or, |
||||
within a display generated by the Derivative Works, if and |
||||
wherever such third-party notices normally appear. The contents |
||||
of the NOTICE file are for informational purposes only and |
||||
do not modify the License. You may add Your own attribution |
||||
notices within Derivative Works that You distribute, alongside |
||||
or as an addendum to the NOTICE text from the Work, provided |
||||
that such additional attribution notices cannot be construed |
||||
as modifying the License. |
||||
|
||||
You may add Your own copyright statement to Your modifications and |
||||
may provide additional or different license terms and conditions |
||||
for use, reproduction, or distribution of Your modifications, or |
||||
for any such Derivative Works as a whole, provided Your use, |
||||
reproduction, and distribution of the Work otherwise complies with |
||||
the conditions stated in this License. |
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise, |
||||
any Contribution intentionally submitted for inclusion in the Work |
||||
by You to the Licensor shall be under the terms and conditions of |
||||
this License, without any additional terms or conditions. |
||||
Notwithstanding the above, nothing herein shall supersede or modify |
||||
the terms of any separate license agreement you may have executed |
||||
with Licensor regarding such Contributions. |
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade |
||||
names, trademarks, service marks, or product names of the Licensor, |
||||
except as required for reasonable and customary use in describing the |
||||
origin of the Work and reproducing the content of the NOTICE file. |
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or |
||||
agreed to in writing, Licensor provides the Work (and each |
||||
Contributor provides its Contributions) on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
||||
implied, including, without limitation, any warranties or conditions |
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A |
||||
PARTICULAR PURPOSE. You are solely responsible for determining the |
||||
appropriateness of using or redistributing the Work and assume any |
||||
risks associated with Your exercise of permissions under this License. |
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory, |
||||
whether in tort (including negligence), contract, or otherwise, |
||||
unless required by applicable law (such as deliberate and grossly |
||||
negligent acts) or agreed to in writing, shall any Contributor be |
||||
liable to You for damages, including any direct, indirect, special, |
||||
incidental, or consequential damages of any character arising as a |
||||
result of this License or out of the use or inability to use the |
||||
Work (including but not limited to damages for loss of goodwill, |
||||
work stoppage, computer failure or malfunction, or any and all |
||||
other commercial damages or losses), even if such Contributor |
||||
has been advised of the possibility of such damages. |
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing |
||||
the Work or Derivative Works thereof, You may choose to offer, |
||||
and charge a fee for, acceptance of support, warranty, indemnity, |
||||
or other liability obligations and/or rights consistent with this |
||||
License. However, in accepting such obligations, You may act only |
||||
on Your own behalf and on Your sole responsibility, not on behalf |
||||
of any other Contributor, and only if You agree to indemnify, |
||||
defend, and hold each Contributor harmless for any liability |
||||
incurred by, or claims asserted against, such Contributor by reason |
||||
of your accepting any such warranty or additional liability. |
||||
|
||||
END OF TERMS AND CONDITIONS |
||||
|
||||
APPENDIX: How to apply the Apache License to your work. |
||||
|
||||
To apply the Apache License to your work, attach the following |
||||
boilerplate notice, with the fields enclosed by brackets "[]" |
||||
replaced with your own identifying information. (Don't include |
||||
the brackets!) The text should be enclosed in the appropriate |
||||
comment syntax for the file format. We also recommend that a |
||||
file or class name and description of purpose be included on the |
||||
same "printed page" as the copyright notice for easier |
||||
identification within third-party archives. |
||||
|
||||
Copyright [yyyy] [name of copyright owner] |
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); |
||||
you may not use this file except in compliance with the License. |
||||
You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
@ -0,0 +1,20 @@ |
||||
BLAKE2b-SIMD |
||||
============ |
||||
|
||||
Pure Go implementation of BLAKE2b using SIMD optimizations. |
||||
|
||||
Introduction |
||||
------------ |
||||
|
||||
This package is based on the pure Go [BLAKE2b](https://github.com/dchest/blake2b) implementation of Dmitry Chestnykh and merges it with the (`cgo` dependent) SSE optimized [BLAKE2](https://github.com/codahale/blake2) implementation (which in turn is based on the [official implementation](https://github.com/BLAKE2/BLAKE2)). It does so by using [Go's Assembler](https://golang.org/doc/asm) for amd64 architectures with a fallback for other architectures. |
||||
|
||||
It gives roughly a 3x performance improvement over the non-optimized Go version. |
||||
|
||||
Benchmarks |
||||
---------- |
||||
|
||||
| Dura | 1 GB | |
||||
| ------------- |:-----:| |
||||
| blake2b-SIMD | 1.59s | |
||||
| blake2b | 4.66s | |
||||
|
@ -0,0 +1,62 @@ |
||||
//+build !noasm
|
||||
//+build !appengine
|
||||
|
||||
/* |
||||
* Minio Cloud Storage, (C) 2016 Minio, Inc. |
||||
* |
||||
* Licensed under the Apache License, Version 2.0 (the "License"); |
||||
* you may not use this file except in compliance with the License. |
||||
* You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package blake2b |
||||
|
||||
// blockAVX has no Go body; it is implemented in assembly (TEXT ·blockAVX).
// Arguments, as used by compressAVX and by the assembly prologue:
//   p      - message bytes; the routine reads the block starting at p[0]
//   in     - current chaining value (h[0..7])
//   iv     - initialization vector words
//   t      - counter words, XORed with iv[4], iv[5]
//   f      - flag words, XORed with iv[6], iv[7]
//   shffle - byte-shuffle mask loaded for the PSHUFB instruction
//   out    - buffer compressAVX reads the new chaining value from after the
//            call (NOTE(review): the store itself is outside the visible
//            assembly - confirm)
//go:noescape
|
||||
func blockAVX(p []uint8, in, iv, t, f, shffle, out []uint64) |
||||
|
||||
func compressAVX(d *digest, p []uint8) { |
||||
h0, h1, h2, h3, h4, h5, h6, h7 := d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] |
||||
|
||||
in := make([]uint64, 8, 8) |
||||
out := make([]uint64, 8, 8) |
||||
|
||||
shffle := make([]uint64, 2, 2) |
||||
// vector for PSHUFB instruction
|
||||
shffle[0] = 0x0201000706050403 |
||||
shffle[1] = 0x0a09080f0e0d0c0b |
||||
|
||||
for len(p) >= BlockSize { |
||||
// Increment counter.
|
||||
d.t[0] += BlockSize |
||||
if d.t[0] < BlockSize { |
||||
d.t[1]++ |
||||
} |
||||
|
||||
in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7] = h0, h1, h2, h3, h4, h5, h6, h7 |
||||
|
||||
blockAVX(p, in, iv[:], d.t[:], d.f[:], shffle, out) |
||||
|
||||
h0, h1, h2, h3, h4, h5, h6, h7 = out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7] |
||||
|
||||
p = p[BlockSize:] |
||||
} |
||||
|
||||
d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] = h0, h1, h2, h3, h4, h5, h6, h7 |
||||
} |
||||
|
||||
func compress(d *digest, p []uint8) { |
||||
// Verifies if AVX is available, use optimized code path.
|
||||
if avx { |
||||
compressAVX(d, p) |
||||
return |
||||
} // else { fallback to generic approach.
|
||||
compressGeneric(d, p) |
||||
} |
@ -0,0 +1,768 @@ |
||||
//+build !noasm !appengine |
||||
|
||||
// |
||||
// Minio Cloud Storage, (C) 2016 Minio, Inc. |
||||
// |
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License. |
||||
// You may obtain a copy of the License at |
||||
// |
||||
// http://www.apache.org/licenses/LICENSE-2.0 |
||||
// |
||||
// Unless required by applicable law or agreed to in writing, software |
||||
// distributed under the License is distributed on an "AS IS" BASIS, |
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
// See the License for the specific language governing permissions and |
||||
// limitations under the License. |
||||
// |
||||
|
||||
// |
||||
// Based on SSE implementation from https://github.com/BLAKE2/BLAKE2/blob/master/sse/blake2b.c |
||||
// |
||||
// Use github.com/fwessels/asm2plan9s on this file to assemble instructions to their Plan9 equivalent |
||||
// |
||||
// Assembly code below essentially follows the ROUND macro (see blake2b-round.h) which is defined as: |
||||
// #define ROUND(r) \ |
||||
// LOAD_MSG_ ##r ##_1(b0, b1); \
|
||||
// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1); \
|
||||
// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1); \
|
||||
// G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1); \
|
||||
// G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
|
||||
// UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
// |
||||
// as well as the go equivalent in https://github.com/dchest/blake2b/blob/master/block.go |
||||
// |
||||
// As in the macro, G1/G2 in the 1st and 2nd half are identical (so literal copy of assembly) |
||||
// |
||||
// Rounds are also the same, except for the loading of the message (and rounds 1 & 11 and |
||||
// rounds 2 & 12 are identical) |
||||
// |
||||
|
||||
// G1 is the first half of the G mixing step, performed on four (a, b, c, d)
// groups at once with two 64-bit lanes per XMM register:
//   a += m (X8/X9);  a += b;  d ^= a;  d = rotate-right(d, 32)
//   c += d;          b ^= c;  b = rotate-right(b, 24)
// where a = X0/X1, b = X2/X3, c = X4/X5, d = X6/X7.
// The rotate by 24 is done with VPSHUFB using the mask in X12, so X12 must
// hold the shuffle value when the macro is expanded.
// Each instruction is emitted as a raw BYTE sequence; the trailing comment on
// every line names the instruction that the sequence encodes.
#define G1 \ |
||||
\ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] */
|
||||
BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xf6; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
|
||||
BYTE $0xc5; BYTE $0xf9; BYTE $0x70; BYTE $0xff; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
|
||||
BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
BYTE $0xc4; BYTE $0xc2; BYTE $0x69; BYTE $0x00; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
|
||||
BYTE $0xc4; BYTE $0xc2; BYTE $0x61; BYTE $0x00; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
|
||||
|
||||
// G2 is the second half of the G mixing step, on the same register layout as
// G1 (a = X0/X1, b = X2/X3, c = X4/X5, d = X6/X7):
//   a += m (X10/X11);  a += b;  d ^= a;  d = rotate-right(d, 16)
//   c += d;            b ^= c;  b = rotate-right(b, 63)
// The rotate by 16 is a VPSHUFLW/VPSHUFHW pair on each register. The rotate
// by 63 is built as (b + b) combined with (b >> 63), using X15 as scratch;
// the two halves have no overlapping bits, so XOR gives the same result as OR.
// X15 is clobbered.
#define G2 \ |
||||
\ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1);
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x79; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x71; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] */
|
||||
BYTE $0xc5; BYTE $0xf9; BYTE $0xd4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
|
||||
BYTE $0xc5; BYTE $0xf1; BYTE $0xd4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
|
||||
BYTE $0xc5; BYTE $0xc9; BYTE $0xef; BYTE $0xf0 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
|
||||
BYTE $0xc5; BYTE $0xc1; BYTE $0xef; BYTE $0xf9 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
|
||||
BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* combined with next ... */
|
||||
BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xf6; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
|
||||
BYTE $0xc5; BYTE $0xfb; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* combined with next ... */
|
||||
BYTE $0xc5; BYTE $0xfa; BYTE $0x70; BYTE $0xff; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
|
||||
BYTE $0xc5; BYTE $0xd9; BYTE $0xd4; BYTE $0xe6 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
|
||||
BYTE $0xc5; BYTE $0xd1; BYTE $0xd4; BYTE $0xef \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
|
||||
BYTE $0xc5; BYTE $0xe9; BYTE $0xef; BYTE $0xd4 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
|
||||
BYTE $0xc5; BYTE $0xe1; BYTE $0xef; BYTE $0xdd \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
|
||||
BYTE $0xc5; BYTE $0x69; BYTE $0xd4; BYTE $0xfa \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2 */
|
||||
BYTE $0xc5; BYTE $0xe9; BYTE $0x73; BYTE $0xd2; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0xef; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* ORed together: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
|
||||
BYTE $0xc5; BYTE $0x61; BYTE $0xd4; BYTE $0xfb \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2 */
|
||||
BYTE $0xc5; BYTE $0xe1; BYTE $0x73; BYTE $0xd3; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0xef; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* ORed together: v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
|
||||
|
||||
// DIAGONALIZE re-arranges the 64-bit lanes of rows 2, 3 and 4 of the state
// (row2 = X2/X3, row3 = X4/X5, row4 = X6/X7), following the DIAGONALIZE macro
// of the reference SSE implementation quoted in the per-line comments.
// Row 1 (X0/X1) is not touched.
//   - row3l and row3h are swapped; X6 serves as the swap temporary, which is
//     why row4l is saved to X13 (t0) first.
//   - row4 and row2 are rebuilt from high/low lane pairs with
//     VPUNPCKLQDQ/VPUNPCKHQDQ.
// Clobbers X13 (t0), X14 (t1) and X15 (scratch).
#define DIAGONALIZE \ |
||||
\ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
MOVOU X6, X13 \ /* t0 = row4l;\ */
|
||||
MOVOU X2, X14 \ /* t1 = row2l;\ */
|
||||
MOVOU X4, X6 \ /* row4l = row3l;\ */
|
||||
MOVOU X5, X4 \ /* row3l = row3h;\ */
|
||||
MOVOU X6, X5 \ /* row3h = row4l;\ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* _mm_unpacklo_epi64(t0, t0) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, ); \ */
|
||||
BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x69; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, ); \ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, ) */
|
||||
|
||||
// UNDIAGONALIZE is the counterpart of DIAGONALIZE, following the
// UNDIAGONALIZE macro of the reference SSE implementation quoted in the
// per-line comments. It operates on row2 = X2/X3, row3 = X4/X5 and
// row4 = X6/X7; row 1 (X0/X1) is not touched.
//   - row3l and row3h are swapped through X13.
//   - X13 is then reused as t0 (copy of row2l) and X14 holds t1 (copy of
//     row4l) while rows 2 and 4 are rebuilt with VPUNPCKLQDQ/VPUNPCKHQDQ.
// Clobbers X13, X14 and X15 (scratch).
#define UNDIAGONALIZE \ |
||||
\ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
|
||||
MOVOU X4, X13 \ /* t0 = row3l;\ */
|
||||
MOVOU X5, X4 \ /* row3l = row3h;\ */
|
||||
MOVOU X13, X5 \ /* row3h = t0;\ */
|
||||
MOVOU X2, X13 \ /* t0 = row2l;\ */
|
||||
MOVOU X6, X14 \ /* t1 = row4l;\ */
|
||||
BYTE $0xc5; BYTE $0x69; BYTE $0x6c; BYTE $0xfa \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* _mm_unpacklo_epi64(row2l, row2l) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x61; BYTE $0x6d; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, ); \ */
|
||||
BYTE $0xc5; BYTE $0x61; BYTE $0x6c; BYTE $0xfb \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* _mm_unpacklo_epi64(row2h, row2h) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x11; BYTE $0x6d; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, ); \ */
|
||||
BYTE $0xc5; BYTE $0x41; BYTE $0x6c; BYTE $0xff \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* _mm_unpacklo_epi64(row4h, row4h) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x49; BYTE $0x6d; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, ); \ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* _mm_unpacklo_epi64(t1, t1) */
|
||||
BYTE $0xc4; BYTE $0xc1; BYTE $0x41; BYTE $0x6d; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, ) */
|
||||
|
||||
// LOAD_SHUFFLE reloads the PSHUFB mask into X12 from the shffle slice
// argument (its data pointer sits at offset 120 in the argument frame).
// It is expanded before each G1/G2 pair because the message-loading code
// uses X12 as an additional temporary register. Clobbers SI.
#define LOAD_SHUFFLE \ |
||||
\ // Load shuffle value |
||||
MOVQ shffle+120(FP), SI \ // SI: &shuffle |
||||
MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a |
||||
|
||||
|
||||
// func blockAVX(p []uint8, in, iv, t, f, shffle, out []uint64) |
||||
TEXT ·blockAVX(SB), 7, $0 |
||||
// REGISTER USE |
||||
// X0 - X7: v0 - v15 |
||||
// X8 - X11: m[0] - m[7] |
||||
// X12: shuffle value |
||||
// X13 - X15: temp registers |
||||
|
||||
// Load digest |
||||
MOVQ in+24(FP), SI // SI: &in |
||||
MOVOU 0(SI), X0 // X0 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */
|
||||
MOVOU 16(SI), X1 // X1 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */
|
||||
MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */
|
||||
MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */
|
||||
|
||||
// Load initialization vector |
||||
MOVQ iv+48(FP), DX // DX: &iv |
||||
MOVOU 0(DX), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */
|
||||
MOVOU 16(DX), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */
|
||||
MOVQ t+72(FP), SI // SI: &t |
||||
MOVOU 32(DX), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */ |
||||
MOVOU 0(SI), X7 // X7 = t[0]+t[1] /* LOAD( &S->t[0] ) */ |
||||
PXOR X7, X6 // X6 = X6 ^ X7 /* row4l = _mm_xor_si128( , ); */
|
||||
MOVQ t+96(FP), SI // SI: &f |
||||
MOVOU 48(DX), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */ |
||||
MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */ |
||||
PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 1 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+m[1] |
||||
MOVOU 16(DX), X13 // X13 = m[2]+m[3] |
||||
MOVOU 32(DX), X14 // X14 = m[4]+m[5] |
||||
MOVOU 48(DX), X15 // X15 = m[6]+m[7] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9] |
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 2 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 112(DX), X12 // X12 = m[14]+m[15] |
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] |
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] |
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11] |
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc; BYTE $0x08// VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] |
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4; BYTE $0x08 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 3 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 32(DX), X12 // X12 = m[4]+ m[5] |
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xc5; BYTE $0x08// VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] |
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] |
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 32(DX), X14 // X14 = m[4]+ m[5] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdc; BYTE $0x08// VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 4 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11] |
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] |
||||
MOVOU 112(DX), X14 // X14 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] |
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 5 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] |
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] |
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xd4; BYTE $0x08// VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 6 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] |
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] |
||||
MOVOU 64(DX), X15 // X15 = m[8]+ m[9] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */
|
||||
MOVOU 80(DX), X12 // X12 = m[10]+m[11] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] |
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */
|
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 7 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */
|
||||
MOVOU 80(DX), X12 // X12 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xde; BYTE $0x08// VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] |
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xce; BYTE $0x08// VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */
|
||||
MOVOU 16(DX), X14 // X14 = m[2]+ m[3] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 8 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] |
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd6; BYTE $0x08// VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] |
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 48(DX), X14 // X14 = m[6]+ m[7] |
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 9 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xce; BYTE $0x08// VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */
|
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] |
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xdd; BYTE $0x08// VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 16(DX), X13 // X13 = m[2]+ m[3] |
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11] |
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x09; BYTE $0x0f; BYTE $0xcc; BYTE $0x08// VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */
|
||||
MOVOU 32(DX), X12 // X12 = m[4]+ m[5] |
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 1 0 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] |
||||
MOVOU 80(DX), X15 // X15 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6c; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 32(DX), X14 // X14 = m[4]+ m[5] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 64(DX), X13 // X13 = m[8]+ m[9] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x01; BYTE $0x6d; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */
|
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xd5; BYTE $0x08// VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 1 1 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+m[1] |
||||
MOVOU 16(DX), X13 // X13 = m[2]+m[3] |
||||
MOVOU 32(DX), X14 // X14 = m[4]+m[5] |
||||
MOVOU 48(DX), X15 // X15 = m[6]+m[7] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 64(DX), X12 // X12 = m[8]+ m[9] |
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
MOVOU 112(DX), X15 // X15 = m[14]+m[15] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6d; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
/////////////////////////////////////////////////////////////////////////// |
||||
// R O U N D 1 2 |
||||
/////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// LOAD_MSG_ ##r ##_1(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_2(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 112(DX), X12 // X12 = m[14]+m[15] |
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] |
||||
MOVOU 64(DX), X14 // X14 = m[8]+ m[9] |
||||
MOVOU 96(DX), X15 // X15 = m[12]+m[13] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x19; BYTE $0x6c; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
|
||||
MOVOU 80(DX), X13 // X13 = m[10]+m[11] |
||||
MOVOU 48(DX), X15 // X15 = m[6]+ m[7] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6c; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
|
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x01; BYTE $0x0f; BYTE $0xdc; BYTE $0x08// VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
DIAGONALIZE |
||||
|
||||
// LOAD_MSG_ ##r ##_3(b0, b1);
|
||||
// LOAD_MSG_ ##r ##_4(b0, b1);
|
||||
// (X12 used as additional temp register) |
||||
MOVQ message+0(FP), DX // DX: &p (message) |
||||
MOVOU 0(DX), X12 // X12 = m[0]+ m[1] |
||||
MOVOU 32(DX), X13 // X13 = m[4]+ m[5] |
||||
MOVOU 80(DX), X14 // X14 = m[10]+m[11] |
||||
BYTE $0xc4; BYTE $0x43; BYTE $0x19; BYTE $0x0f; BYTE $0xc4; BYTE $0x08 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6d; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
|
||||
MOVOU 16(DX), X12 // X12 = m[2]+ m[3] |
||||
MOVOU 48(DX), X13 // X13 = m[6]+ m[7] |
||||
MOVOU 96(DX), X14 // X14 = m[12]+m[13] |
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x09; BYTE $0x6c; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
|
||||
BYTE $0xc4; BYTE $0x41; BYTE $0x11; BYTE $0x6d; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
|
||||
|
||||
LOAD_SHUFFLE |
||||
|
||||
G1 |
||||
G2 |
||||
|
||||
UNDIAGONALIZE |
||||
|
||||
// Reload digest |
||||
MOVQ in+24(FP), SI // SI: &in |
||||
MOVOU 0(SI), X12 // X12 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */
|
||||
MOVOU 16(SI), X13 // X13 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */
|
||||
MOVOU 32(SI), X14 // X14 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */
|
||||
MOVOU 48(SI), X15 // X15 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */
|
||||
|
||||
// Final computations and prepare for storing |
||||
PXOR X4, X0 // X0 = X0 ^ X4 /* row1l = _mm_xor_si128( row3l, row1l ); */
|
||||
PXOR X5, X1 // X1 = X1 ^ X5 /* row1h = _mm_xor_si128( row3h, row1h ); */
|
||||
PXOR X12, X0 // X0 = X0 ^ X12 /* STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); */
|
||||
PXOR X13, X1 // X1 = X1 ^ X13 /* STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); */
|
||||
PXOR X6, X2 // X2 = X2 ^ X6 /* row2l = _mm_xor_si128( row4l, row2l ); */
|
||||
PXOR X7, X3 // X3 = X3 ^ X7 /* row2h = _mm_xor_si128( row4h, row2h ); */
|
||||
PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */
|
||||
PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */
|
||||
|
||||
// Store digest |
||||
MOVQ out+144(FP), DX // DX: &out |
||||
MOVOU X0, 0(DX) // out[0]+out[1] = X0 |
||||
MOVOU X1, 16(DX) // out[2]+out[3] = X1 |
||||
MOVOU X2, 32(DX) // out[4]+out[5] = X2 |
||||
MOVOU X3, 48(DX) // out[6]+out[7] = X3 |
||||
|
||||
RET |
||||
|
@ -0,0 +1,14 @@ |
||||
//+build !amd64 noasm appengine
|
||||
|
||||
// Written in 2012 by Dmitry Chestnykh.
|
||||
//
|
||||
// To the extent possible under law, the author have dedicated all copyright
|
||||
// and related and neighboring rights to this software to the public domain
|
||||
// worldwide. This software is distributed without any warranty.
|
||||
// http://creativecommons.org/publicdomain/zero/1.0/
|
||||
|
||||
package blake2b |
||||
|
||||
// compress is the variant selected by this file's build constraint
// (!amd64 noasm appengine). It does no work of its own and simply forwards
// the digest state d and the input block p to compressGeneric.
func compress(d *digest, p []uint8) { |
||||
compressGeneric(d, p) |
||||
} |
@ -0,0 +1,37 @@ |
||||
// +build 386,!gccgo amd64,!gccgo
|
||||
|
||||
// Copyright 2016 Frank Wessels <fwessels@xs4all.nl>
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package blake2b |
||||
|
||||
func cpuid(op uint32) (eax, ebx, ecx, edx uint32) |
||||
func xgetbv(index uint32) (eax, edx uint32) |
||||
|
||||
// avx is true when AVX instructions are available (see haveAVX).
|
||||
var avx = haveAVX() |
||||
|
||||
// haveAVX reports whether AVX instructions can be used. The CPU must
// advertise XSAVE, OSXSAVE and AVX in the ECX output of CPUID leaf 1, and the
// value returned by xgetbv(0) must have bits 1 and 2 set.
|
||||
func haveAVX() bool { |
||||
// Only the ECX result of CPUID leaf 1 is needed; the rest is discarded.
_, _, c, _ := cpuid(1) |
||||
|
||||
// Check the XSAVE (bit 26), OSXSAVE (bit 27) and AVX (bit 28) feature bits.
|
||||
if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 { |
||||
// Check for OS support: mask 0x6 selects bits 1 and 2 of the register
// read by xgetbv(0) (XCR0: SSE and AVX state), and both must be enabled.
|
||||
eax, _ := xgetbv(0) |
||||
return (eax & 0x6) == 0x6 |
||||
} |
||||
return false |
||||
} |
@ -0,0 +1,22 @@ |
||||
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. |
||||
|
||||
// +build 386,!gccgo |
||||
|
||||
// func cpuid(op uint32) (eax, ebx, ecx, edx uint32) |
//
// 386 variant: executes CPUID with AX = op and CX = 0 and returns the four
// result registers. The argument is read from offset 0 of the frame and the
// results are written to offsets 4, 8, 12 and 16.
// Flag value 7 is NOPROF|DUPOK|NOSPLIT; the function uses no local frame ($0).
||||
TEXT ·cpuid(SB), 7, $0 |
||||
// Zero CX before CPUID (it is the sub-leaf selector for leaves that have one).
XORL CX, CX |
||||
// AX selects the CPUID leaf.
MOVL op+0(FP), AX |
||||
CPUID |
||||
// Copy AX, BX, CX, DX to the named results in declaration order.
MOVL AX, eax+4(FP) |
||||
MOVL BX, ebx+8(FP) |
||||
MOVL CX, ecx+12(FP) |
||||
MOVL DX, edx+16(FP) |
||||
RET |
||||
|
||||
// func xgetbv(index uint32) (eax, edx uint32) |
//
// 386 variant: executes XGETBV with CX = index and returns AX and DX.
// The argument is read from offset 0 of the frame and the results are
// written to offsets 4 and 8.
||||
TEXT ·xgetbv(SB), 7, $0 |
||||
// CX selects which extended control register XGETBV reads.
MOVL index+0(FP), CX |
||||
// The instruction is emitted as raw opcode bytes rather than a mnemonic.
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
|
||||
MOVL AX, eax+4(FP) |
||||
MOVL DX, edx+8(FP) |
||||
RET |
@ -0,0 +1,22 @@ |
||||
// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. |
||||
|
||||
// +build amd64,!gccgo |
||||
|
||||
// func cpuid(op uint32) (eax, ebx, ecx, edx uint32) |
//
// amd64 variant: executes CPUID with AX = op and CX = 0 and returns the four
// result registers. The argument is read from offset 0 of the frame and the
// results are written to offsets 8, 12, 16 and 20.
// Flag value 7 is NOPROF|DUPOK|NOSPLIT; the function uses no local frame ($0).
||||
TEXT ·cpuid(SB), 7, $0 |
||||
// Zero CX before CPUID (it is the sub-leaf selector for leaves that have one).
XORQ CX, CX |
||||
// AX selects the CPUID leaf.
MOVL op+0(FP), AX |
||||
CPUID |
||||
// Copy AX, BX, CX, DX to the named results in declaration order.
MOVL AX, eax+8(FP) |
||||
MOVL BX, ebx+12(FP) |
||||
MOVL CX, ecx+16(FP) |
||||
MOVL DX, edx+20(FP) |
||||
RET |
||||
|
||||
// func xgetbv(index uint32) (eax, edx uint32) |
//
// amd64 variant: executes XGETBV with CX = index and returns AX and DX.
// The argument is read from offset 0 of the frame and the results are
// written to offsets 8 and 12.
||||
TEXT ·xgetbv(SB), 7, $0 |
||||
// CX selects which extended control register XGETBV reads.
MOVL index+0(FP), CX |
||||
// The instruction is emitted as raw opcode bytes rather than a mnemonic.
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
|
||||
MOVL AX, eax+8(FP) |
||||
MOVL DX, edx+12(FP) |
||||
RET |
Loading…
Reference in new issue