/*
 * Multi-buffer SHA256 algorithm hash compute routine
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 *  Copyright(c) 2016 Intel Corporation.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of version 2 of the GNU General Public License as
 *  published by the Free Software Foundation.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  Contact Information:
 *	Megha Dey <megha.dey@linux.intel.com>
 *
 *  BSD LICENSE
 *
 *  Copyright(c) 2016 Intel Corporation.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <linux/linkage.h>
#include "sha256_mb_mgr_datastruct.S"

## code to compute oct (x8) SHA256 using AVX2
## outer calling routine takes care of save and restore of YMM registers
## Logic designed/laid out by JDG

## Function clobbers: rax, rcx, rdx,   rbx, rsi, rdi, r9-r15; %ymm0-15
## Linux clobbers:    rax rcx rdx rsi r9 r10 r11
## Linux preserves:   rdi rbp r8
##                    (rbx and r12-r15 are callee-saved: pushed/popped below)
##
## clobbers %ymm0-15

arg1 = %rdi
arg2 = %rsi
reg3 = %rcx
reg4 = %rdx

# Common definitions
STATE = arg1
INP_SIZE = arg2

IDX = %rax
ROUND = %rbx
TBL = reg3

inp0 = %r9
inp1 = %r10
inp2 = %r11
inp3 = %r12
inp4 = %r13
inp5 = %r14
inp6 = %r15
inp7 = reg4

a = %ymm0
b = %ymm1
c = %ymm2
d = %ymm3
e = %ymm4
f = %ymm5
g = %ymm6
h = %ymm7

T1 = %ymm8

a0 = %ymm12
a1 = %ymm13
a2 = %ymm14
TMP = %ymm15
TMP0 = %ymm6
TMP1 = %ymm7

TT0 = %ymm8
TT1 = %ymm9
TT2 = %ymm10
TT3 = %ymm11
TT4 = %ymm12
TT5 = %ymm13
TT6 = %ymm14
TT7 = %ymm15

# Define stack usage

# No particular stack alignment is assumed on entry: the prologue saves
# the incoming %rsp in the frame's _rsp slot and aligns the new frame
# down to a 32-byte boundary ("and $~0x1F, %rsp") itself.

#define FRAMESZ	0x388
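
/*
 * A minimal C model (for reference only, not assembled) of the frame
 * setup done in the prologue below: subtract FRAMESZ from %rsp, then
 * round down to a 32-byte boundary.  The original %rsp is stashed in
 * the _rsp slot (defined in sha256_mb_mgr_datastruct.S) so the
 * epilogue can restore it.
 *
 *	#include <stdint.h>
 *
 *	static uintptr_t align_frame(uintptr_t rsp, uintptr_t framesz)
 *	{
 *		// round the new frame base down to 32 bytes
 *		return (rsp - framesz) & ~(uintptr_t)0x1F;
 *	}
 */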

#define VMOVPS	vmovups

# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
# "transpose" data in {r0...r7} using temps {t0...t1}
# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {a7 a6 a5 a4   a3 a2 a1 a0}
# r1 = {b7 b6 b5 b4   b3 b2 b1 b0}
# r2 = {c7 c6 c5 c4   c3 c2 c1 c0}
# r3 = {d7 d6 d5 d4   d3 d2 d1 d0}
# r4 = {e7 e6 e5 e4   e3 e2 e1 e0}
# r5 = {f7 f6 f5 f4   f3 f2 f1 f0}
# r6 = {g7 g6 g5 g4   g3 g2 g1 g0}
# r7 = {h7 h6 h5 h4   h3 h2 h1 h0}
#
# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
# r0 = {h0 g0 f0 e0   d0 c0 b0 a0}
# r1 = {h1 g1 f1 e1   d1 c1 b1 a1}
# r2 = {h2 g2 f2 e2   d2 c2 b2 a2}
# r3 = {h3 g3 f3 e3   d3 c3 b3 a3}
# r4 = {h4 g4 f4 e4   d4 c4 b4 a4}
# r5 = {h5 g5 f5 e5   d5 c5 b5 a5}
# r6 = {h6 g6 f6 e6   d6 c6 b6 a6}
# r7 = {h7 g7 f7 e7   d7 c7 b7 a7}
#

.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
	# process top half (r0..r3) {a...d}
	vshufps	$0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4   b1 b0 a1 a0}
	vshufps	$0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6   b3 b2 a3 a2}
	vshufps	$0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4   d1 d0 c1 c0}
	vshufps	$0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6   d3 d2 c3 c2}
	vshufps	$0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5   d1 c1 b1 a1}
	vshufps	$0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6   d2 c2 b2 a2}
	vshufps	$0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7   d3 c3 b3 a3}
	vshufps	$0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4   d0 c0 b0 a0}

	# use r2 in place of t0
	# process bottom half (r4..r7) {e...h}
	vshufps	$0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4   f1 f0 e1 e0}
	vshufps	$0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6   f3 f2 e3 e2}
	vshufps	$0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4   h1 h0 g1 g0}
	vshufps	$0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6   h3 h2 g3 g2}
	vshufps	$0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5   h1 g1 f1 e1}
	vshufps	$0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6   h2 g2 f2 e2}
	vshufps	$0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7   h3 g3 f3 e3}
	vshufps	$0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4   h0 g0 f0 e0}

	vperm2f128	$0x13, \r1, \r5, \r6  # h6...a6
	vperm2f128	$0x02, \r1, \r5, \r2  # h2...a2
	vperm2f128	$0x13, \r3, \r7, \r5  # h5...a5
	vperm2f128	$0x02, \r3, \r7, \r1  # h1...a1
	vperm2f128	$0x13, \r0, \r4, \r7  # h7...a7
	vperm2f128	$0x02, \r0, \r4, \r3  # h3...a3
	vperm2f128	$0x13, \t0, \t1, \r4  # h4...a4
	vperm2f128	$0x02, \t0, \t1, \r0  # h0...a0

.endm
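
/*
 * A scalar C model (reference only) of what TRANSPOSE8 computes: the
 * eight YMM rows form an 8x8 matrix of 32-bit words, and the macro
 * transposes it so output row i holds word i of every input row.
 *
 *	#include <stdint.h>
 *
 *	static void transpose8(uint32_t m[8][8])
 *	{
 *		for (int i = 0; i < 8; i++)
 *			for (int j = i + 1; j < 8; j++) {
 *				uint32_t t = m[i][j];	// swap across the diagonal
 *				m[i][j] = m[j][i];
 *				m[j][i] = t;
 *			}
 *	}
 */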

# Rotate the symbolic register names: h <- g <- f <- e <- d <- c <- b <- a,
# and the old h (which now holds the round's new working value) becomes a.
# Renaming the assembler symbols avoids moving any vector data between rounds.
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm

# rotate each 32-bit lane of \reg right by \imm, using \tmp as scratch
.macro _PRORD reg imm tmp
	vpslld	$(32-\imm),\reg,\tmp
	vpsrld	$\imm,\reg, \reg
	vpor	\tmp,\reg, \reg
.endm

# PRORD_nd reg, imm, tmp, src
.macro _PRORD_nd reg imm tmp src
	vpslld	$(32-\imm), \src, \tmp
	vpsrld	$\imm, \src, \reg
	vpor	\tmp, \reg, \reg
.endm

# PRORD dst/src, amt
.macro PRORD reg imm
	_PRORD	\reg,\imm,TMP
.endm

# PRORD_nd dst, src, amt
.macro PRORD_nd reg tmp imm
	_PRORD_nd	\reg, \imm, TMP, \tmp
.endm

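/*
 * Scalar C model (reference only) of _PRORD: rotate each 32-bit lane
 * right by imm.  AVX2 has no packed-rotate instruction, so the macros
 * above synthesize it from two shifts and an OR.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t prord(uint32_t x, unsigned imm)	// 1 <= imm <= 31
 *	{
 *		return (x >> imm) | (x << (32 - imm));
 *	}
 */
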
# arguments passed implicitly in assembler symbols i, a...h
.macro ROUND_00_15 _T1 i
	PRORD_nd	a0,e,5	# sig1: a0 = (e >> 5)

	vpxor	g, f, a2	# ch: a2 = f^g
	vpand	e,a2, a2	# ch: a2 = (f^g)&e
	vpxor	g, a2, a2	# a2 = ch

	PRORD_nd	a1,e,25	# sig1: a1 = (e >> 25)

	vmovdqu	\_T1,(SZ8*(\i & 0xf))(%rsp)
	vpaddd	(TBL,ROUND,1), \_T1, \_T1	# T1 = W + K
	vpxor	e,a0, a0	# sig1: a0 = e ^ (e >> 5)
	PRORD	a0, 6		# sig1: a0 = (e >> 6) ^ (e >> 11)
	vpaddd	a2, h, h	# h = h + ch
	PRORD_nd	a2,a,11	# sig0: a2 = (a >> 11)
	vpaddd	\_T1,h, h	# h = h + ch + W + K
	vpxor	a1, a0, a0	# a0 = sigma1
	PRORD_nd	a1,a,22	# sig0: a1 = (a >> 22)
	vpxor	c, a, \_T1	# maj: T1 = a^c
	add	$SZ8, ROUND	# ROUND++
	vpand	b, \_T1, \_T1	# maj: T1 = (a^c)&b
	vpaddd	a0, h, h
	vpaddd	h, d, d
	vpxor	a, a2, a2	# sig0: a2 = a ^ (a >> 11)
	PRORD	a2,2		# sig0: a2 = (a >> 2) ^ (a >> 13)
	vpxor	a1, a2, a2	# a2 = sig0
	vpand	c, a, a1	# maj: a1 = a&c
	vpor	\_T1, a1, a1	# a1 = maj
	vpaddd	a1, h, h	# h = h + ch + W + K + maj
	vpaddd	a2, h, h	# h = h + ch + W + K + maj + sigma0
	ROTATE_ARGS
.endm
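
/*
 * Scalar C model (reference only) of one ROUND_00_15 step for a single
 * lane; the macro performs the same update on eight lanes at once and
 * then renames the registers with ROTATE_ARGS instead of moving data.
 * The ch and maj forms below match the vector code above.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rotr(uint32_t x, unsigned n)
 *	{
 *		return (x >> n) | (x << (32 - n));
 *	}
 *
 *	static void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
 *	{
 *		uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
 *		uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
 *		uint32_t sig1 = rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25);
 *		uint32_t ch   = ((f ^ g) & e) ^ g;
 *		uint32_t t1   = h + sig1 + ch + k + w;
 *		uint32_t sig0 = rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22);
 *		uint32_t maj  = ((a ^ c) & b) | (a & c);
 *
 *		s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
 *		s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + sig0 + maj;
 *	}
 */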

# arguments passed implicitly in assembler symbols i, a...h
.macro ROUND_16_XX _T1 i
	vmovdqu	(SZ8*((\i-15)&0xf))(%rsp), \_T1
	vmovdqu	(SZ8*((\i-2)&0xf))(%rsp), a1
	vmovdqu	\_T1, a0
	PRORD	\_T1,11
	vmovdqu	a1, a2
	PRORD	a1,2
	vpxor	a0, \_T1, \_T1
	PRORD	\_T1, 7
	vpxor	a2, a1, a1
	PRORD	a1, 17
	vpsrld	$3, a0, a0
	vpxor	a0, \_T1, \_T1
	vpsrld	$10, a2, a2
	vpxor	a2, a1, a1
	vpaddd	(SZ8*((\i-16)&0xf))(%rsp), \_T1, \_T1
	vpaddd	(SZ8*((\i-7)&0xf))(%rsp), a1, a1
	vpaddd	a1, \_T1, \_T1

	ROUND_00_15 \_T1,\i
.endm
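
/*
 * Scalar C model (reference only) of the ROUND_16_XX schedule update
 * for one lane:
 *
 *	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
 *
 * Only the last 16 W values are kept, in a ring buffer on the stack
 * indexed by (i & 0xf); ROUND_00_15 stores the result back into the
 * ring as it consumes it.
 *
 *	#include <stdint.h>
 *
 *	static uint32_t rotr32(uint32_t x, unsigned n)
 *	{
 *		return (x >> n) | (x << (32 - n));
 *	}
 *
 *	static uint32_t sha256_schedule(const uint32_t w[16], unsigned i)
 *	{
 *		uint32_t w15 = w[(i - 15) & 0xf], w2 = w[(i - 2) & 0xf];
 *		uint32_t s0 = rotr32(w15, 7) ^ rotr32(w15, 18) ^ (w15 >> 3);
 *		uint32_t s1 = rotr32(w2, 17) ^ rotr32(w2, 19) ^ (w2 >> 10);
 *
 *		return s1 + w[(i - 7) & 0xf] + s0 + w[(i - 16) & 0xf];
 *	}
 */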

# SHA256_ARGS:
#   UINT32 digest[8][8];  // transposed digests (row i = word i of all 8 lanes)
#   UINT8  *data_ptr[8];  // one input pointer per lane

# void sha256_x8_avx2(SHA256_ARGS *args, UINT64 num_blocks);
# arg 1 : STATE    : pointer to SHA256_ARGS (digests and input data pointers)
# arg 2 : INP_SIZE : size of input in blocks
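
/*
 * A hypothetical C view of the layout this routine assumes; the
 * authoritative definition lives in sha256_mb_mgr_datastruct.S
 * (SHA256_DIGEST_ROW_SIZE, _args_data_ptr, PTR_SZ).
 *
 *	#include <stdint.h>
 *
 *	typedef struct sha256_args {
 *		uint32_t digest[8][8];	// digest[row][lane], transposed
 *		uint8_t *data_ptr[8];	// one input stream per lane
 *	} SHA256_ARGS;
 */
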
	# general registers preserved in outer calling routine
	# outer calling routine saves all the YMM registers
	# save rsp, allocate 32-byte aligned for local variables
ENTRY(sha256_x8_avx2)

	# save callee-saved clobbered registers to comply with C function ABI
	push    %rbx
	push    %r12
	push    %r13
	push    %r14
	push    %r15

	mov	%rsp, IDX
	sub	$FRAMESZ, %rsp
	and	$~0x1F, %rsp
	mov	IDX, _rsp(%rsp)

	# Load the pre-transposed incoming digest.
	vmovdqu	0*SHA256_DIGEST_ROW_SIZE(STATE),a
	vmovdqu	1*SHA256_DIGEST_ROW_SIZE(STATE),b
	vmovdqu	2*SHA256_DIGEST_ROW_SIZE(STATE),c
	vmovdqu	3*SHA256_DIGEST_ROW_SIZE(STATE),d
	vmovdqu	4*SHA256_DIGEST_ROW_SIZE(STATE),e
	vmovdqu	5*SHA256_DIGEST_ROW_SIZE(STATE),f
	vmovdqu	6*SHA256_DIGEST_ROW_SIZE(STATE),g
	vmovdqu	7*SHA256_DIGEST_ROW_SIZE(STATE),h

	lea	K256_8(%rip),TBL

	# load the address of each of the 8 message lanes
	# getting ready to transpose input onto stack
	mov	_args_data_ptr+0*PTR_SZ(STATE),inp0
	mov	_args_data_ptr+1*PTR_SZ(STATE),inp1
	mov	_args_data_ptr+2*PTR_SZ(STATE),inp2
	mov	_args_data_ptr+3*PTR_SZ(STATE),inp3
	mov	_args_data_ptr+4*PTR_SZ(STATE),inp4
	mov	_args_data_ptr+5*PTR_SZ(STATE),inp5
	mov	_args_data_ptr+6*PTR_SZ(STATE),inp6
	mov	_args_data_ptr+7*PTR_SZ(STATE),inp7

	xor	IDX, IDX
lloop:
	xor	ROUND, ROUND

	# save old digest
	vmovdqu	a, _digest(%rsp)
	vmovdqu	b, _digest+1*SZ8(%rsp)
	vmovdqu	c, _digest+2*SZ8(%rsp)
	vmovdqu	d, _digest+3*SZ8(%rsp)
	vmovdqu	e, _digest+4*SZ8(%rsp)
	vmovdqu	f, _digest+5*SZ8(%rsp)
	vmovdqu	g, _digest+6*SZ8(%rsp)
	vmovdqu	h, _digest+7*SZ8(%rsp)
	i = 0
.rep 2
	VMOVPS	i*32(inp0, IDX), TT0
	VMOVPS	i*32(inp1, IDX), TT1
	VMOVPS	i*32(inp2, IDX), TT2
	VMOVPS	i*32(inp3, IDX), TT3
	VMOVPS	i*32(inp4, IDX), TT4
	VMOVPS	i*32(inp5, IDX), TT5
	VMOVPS	i*32(inp6, IDX), TT6
	VMOVPS	i*32(inp7, IDX), TT7
	# g and h alias TMP0 and TMP1, which TRANSPOSE8 clobbers: spill them
	vmovdqu	g, _ytmp(%rsp)
	vmovdqu	h, _ytmp+1*SZ8(%rsp)
	TRANSPOSE8	TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7,   TMP0, TMP1
	vmovdqu	PSHUFFLE_BYTE_FLIP_MASK(%rip), TMP1
	vmovdqu	_ytmp(%rsp), g
	vpshufb	TMP1, TT0, TT0
	vpshufb	TMP1, TT1, TT1
	vpshufb	TMP1, TT2, TT2
	vpshufb	TMP1, TT3, TT3
	vpshufb	TMP1, TT4, TT4
	vpshufb	TMP1, TT5, TT5
	vpshufb	TMP1, TT6, TT6
	vpshufb	TMP1, TT7, TT7
	vmovdqu	_ytmp+1*SZ8(%rsp), h
	# TT4-TT7 alias a0-a2 and TMP, which ROUND_00_15 clobbers: spill them
	vmovdqu	TT4, _ytmp(%rsp)
	vmovdqu	TT5, _ytmp+1*SZ8(%rsp)
	vmovdqu	TT6, _ytmp+2*SZ8(%rsp)
	vmovdqu	TT7, _ytmp+3*SZ8(%rsp)
	ROUND_00_15	TT0,(i*8+0)
	vmovdqu	_ytmp(%rsp), TT0
	ROUND_00_15	TT1,(i*8+1)
	vmovdqu	_ytmp+1*SZ8(%rsp), TT1
	ROUND_00_15	TT2,(i*8+2)
	vmovdqu	_ytmp+2*SZ8(%rsp), TT2
	ROUND_00_15	TT3,(i*8+3)
	vmovdqu	_ytmp+3*SZ8(%rsp), TT3
	ROUND_00_15	TT0,(i*8+4)
	ROUND_00_15	TT1,(i*8+5)
	ROUND_00_15	TT2,(i*8+6)
	ROUND_00_15	TT3,(i*8+7)
	i = (i+1)
.endr
	add	$64, IDX
	i = (i*8)

	jmp	Lrounds_16_xx	# skip the padding inserted by .align
.align 16
Lrounds_16_xx:
.rep 16
	ROUND_16_XX	T1, i
	i = (i+1)
.endr

	cmp	$ROUNDS,ROUND
	jb	Lrounds_16_xx

	# add old digest
	vpaddd	_digest+0*SZ8(%rsp), a, a
	vpaddd	_digest+1*SZ8(%rsp), b, b
	vpaddd	_digest+2*SZ8(%rsp), c, c
	vpaddd	_digest+3*SZ8(%rsp), d, d
	vpaddd	_digest+4*SZ8(%rsp), e, e
	vpaddd	_digest+5*SZ8(%rsp), f, f
	vpaddd	_digest+6*SZ8(%rsp), g, g
	vpaddd	_digest+7*SZ8(%rsp), h, h

	sub	$1, INP_SIZE  # unit is blocks
	jne	lloop

	# write back to memory (state object) the transposed digest
	vmovdqu	a, 0*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu	b, 1*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu	c, 2*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu	d, 3*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu	e, 4*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu	f, 5*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu	g, 6*SHA256_DIGEST_ROW_SIZE(STATE)
	vmovdqu	h, 7*SHA256_DIGEST_ROW_SIZE(STATE)

	# update input pointers
	add	IDX, inp0
	mov	inp0, _args_data_ptr+0*8(STATE)
	add	IDX, inp1
	mov	inp1, _args_data_ptr+1*8(STATE)
	add	IDX, inp2
	mov	inp2, _args_data_ptr+2*8(STATE)
	add	IDX, inp3
	mov	inp3, _args_data_ptr+3*8(STATE)
	add	IDX, inp4
	mov	inp4, _args_data_ptr+4*8(STATE)
	add	IDX, inp5
	mov	inp5, _args_data_ptr+5*8(STATE)
	add	IDX, inp6
	mov	inp6, _args_data_ptr+6*8(STATE)
	add	IDX, inp7
	mov	inp7, _args_data_ptr+7*8(STATE)

	# Postamble
	mov	_rsp(%rsp), %rsp

	# restore callee-saved clobbered registers
	pop     %r15
	pop     %r14
	pop     %r13
	pop     %r12
	pop     %rbx

	ret
ENDPROC(sha256_x8_avx2)

.section	.rodata.K256_8, "a", @progbits
.align 64
K256_8:
	.octa	0x428a2f98428a2f98428a2f98428a2f98
	.octa	0x428a2f98428a2f98428a2f98428a2f98
	.octa	0x71374491713744917137449171374491
	.octa	0x71374491713744917137449171374491
	.octa	0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
	.octa	0xb5c0fbcfb5c0fbcfb5c0fbcfb5c0fbcf
	.octa	0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
	.octa	0xe9b5dba5e9b5dba5e9b5dba5e9b5dba5
	.octa	0x3956c25b3956c25b3956c25b3956c25b
	.octa	0x3956c25b3956c25b3956c25b3956c25b
	.octa	0x59f111f159f111f159f111f159f111f1
	.octa	0x59f111f159f111f159f111f159f111f1
	.octa	0x923f82a4923f82a4923f82a4923f82a4
	.octa	0x923f82a4923f82a4923f82a4923f82a4
	.octa	0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
	.octa	0xab1c5ed5ab1c5ed5ab1c5ed5ab1c5ed5
	.octa	0xd807aa98d807aa98d807aa98d807aa98
	.octa	0xd807aa98d807aa98d807aa98d807aa98
	.octa	0x12835b0112835b0112835b0112835b01
	.octa	0x12835b0112835b0112835b0112835b01
	.octa	0x243185be243185be243185be243185be
	.octa	0x243185be243185be243185be243185be
	.octa	0x550c7dc3550c7dc3550c7dc3550c7dc3
	.octa	0x550c7dc3550c7dc3550c7dc3550c7dc3
	.octa	0x72be5d7472be5d7472be5d7472be5d74
	.octa	0x72be5d7472be5d7472be5d7472be5d74
	.octa	0x80deb1fe80deb1fe80deb1fe80deb1fe
	.octa	0x80deb1fe80deb1fe80deb1fe80deb1fe
	.octa	0x9bdc06a79bdc06a79bdc06a79bdc06a7
	.octa	0x9bdc06a79bdc06a79bdc06a79bdc06a7
	.octa	0xc19bf174c19bf174c19bf174c19bf174
	.octa	0xc19bf174c19bf174c19bf174c19bf174
	.octa	0xe49b69c1e49b69c1e49b69c1e49b69c1
	.octa	0xe49b69c1e49b69c1e49b69c1e49b69c1
	.octa	0xefbe4786efbe4786efbe4786efbe4786
	.octa	0xefbe4786efbe4786efbe4786efbe4786
	.octa	0x0fc19dc60fc19dc60fc19dc60fc19dc6
	.octa	0x0fc19dc60fc19dc60fc19dc60fc19dc6
	.octa	0x240ca1cc240ca1cc240ca1cc240ca1cc
	.octa	0x240ca1cc240ca1cc240ca1cc240ca1cc
	.octa	0x2de92c6f2de92c6f2de92c6f2de92c6f
	.octa	0x2de92c6f2de92c6f2de92c6f2de92c6f
	.octa	0x4a7484aa4a7484aa4a7484aa4a7484aa
	.octa	0x4a7484aa4a7484aa4a7484aa4a7484aa
	.octa	0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
	.octa	0x5cb0a9dc5cb0a9dc5cb0a9dc5cb0a9dc
	.octa	0x76f988da76f988da76f988da76f988da
	.octa	0x76f988da76f988da76f988da76f988da
	.octa	0x983e5152983e5152983e5152983e5152
	.octa	0x983e5152983e5152983e5152983e5152
	.octa	0xa831c66da831c66da831c66da831c66d
	.octa	0xa831c66da831c66da831c66da831c66d
	.octa	0xb00327c8b00327c8b00327c8b00327c8
	.octa	0xb00327c8b00327c8b00327c8b00327c8
	.octa	0xbf597fc7bf597fc7bf597fc7bf597fc7
	.octa	0xbf597fc7bf597fc7bf597fc7bf597fc7
	.octa	0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
	.octa	0xc6e00bf3c6e00bf3c6e00bf3c6e00bf3
	.octa	0xd5a79147d5a79147d5a79147d5a79147
	.octa	0xd5a79147d5a79147d5a79147d5a79147
	.octa	0x06ca635106ca635106ca635106ca6351
	.octa	0x06ca635106ca635106ca635106ca6351
	.octa	0x14292967142929671429296714292967
	.octa	0x14292967142929671429296714292967
	.octa	0x27b70a8527b70a8527b70a8527b70a85
	.octa	0x27b70a8527b70a8527b70a8527b70a85
	.octa	0x2e1b21382e1b21382e1b21382e1b2138
	.octa	0x2e1b21382e1b21382e1b21382e1b2138
	.octa	0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
	.octa	0x4d2c6dfc4d2c6dfc4d2c6dfc4d2c6dfc
	.octa	0x53380d1353380d1353380d1353380d13
	.octa	0x53380d1353380d1353380d1353380d13
	.octa	0x650a7354650a7354650a7354650a7354
	.octa	0x650a7354650a7354650a7354650a7354
	.octa	0x766a0abb766a0abb766a0abb766a0abb
	.octa	0x766a0abb766a0abb766a0abb766a0abb
	.octa	0x81c2c92e81c2c92e81c2c92e81c2c92e
	.octa	0x81c2c92e81c2c92e81c2c92e81c2c92e
	.octa	0x92722c8592722c8592722c8592722c85
	.octa	0x92722c8592722c8592722c8592722c85
	.octa	0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
	.octa	0xa2bfe8a1a2bfe8a1a2bfe8a1a2bfe8a1
	.octa	0xa81a664ba81a664ba81a664ba81a664b
	.octa	0xa81a664ba81a664ba81a664ba81a664b
	.octa	0xc24b8b70c24b8b70c24b8b70c24b8b70
	.octa	0xc24b8b70c24b8b70c24b8b70c24b8b70
	.octa	0xc76c51a3c76c51a3c76c51a3c76c51a3
	.octa	0xc76c51a3c76c51a3c76c51a3c76c51a3
	.octa	0xd192e819d192e819d192e819d192e819
	.octa	0xd192e819d192e819d192e819d192e819
	.octa	0xd6990624d6990624d6990624d6990624
	.octa	0xd6990624d6990624d6990624d6990624
	.octa	0xf40e3585f40e3585f40e3585f40e3585
	.octa	0xf40e3585f40e3585f40e3585f40e3585
	.octa	0x106aa070106aa070106aa070106aa070
	.octa	0x106aa070106aa070106aa070106aa070
	.octa	0x19a4c11619a4c11619a4c11619a4c116
	.octa	0x19a4c11619a4c11619a4c11619a4c116
	.octa	0x1e376c081e376c081e376c081e376c08
	.octa	0x1e376c081e376c081e376c081e376c08
	.octa	0x2748774c2748774c2748774c2748774c
	.octa	0x2748774c2748774c2748774c2748774c
	.octa	0x34b0bcb534b0bcb534b0bcb534b0bcb5
	.octa	0x34b0bcb534b0bcb534b0bcb534b0bcb5
	.octa	0x391c0cb3391c0cb3391c0cb3391c0cb3
	.octa	0x391c0cb3391c0cb3391c0cb3391c0cb3
	.octa	0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
	.octa	0x4ed8aa4a4ed8aa4a4ed8aa4a4ed8aa4a
	.octa	0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
	.octa	0x5b9cca4f5b9cca4f5b9cca4f5b9cca4f
	.octa	0x682e6ff3682e6ff3682e6ff3682e6ff3
	.octa	0x682e6ff3682e6ff3682e6ff3682e6ff3
	.octa	0x748f82ee748f82ee748f82ee748f82ee
	.octa	0x748f82ee748f82ee748f82ee748f82ee
	.octa	0x78a5636f78a5636f78a5636f78a5636f
	.octa	0x78a5636f78a5636f78a5636f78a5636f
	.octa	0x84c8781484c8781484c8781484c87814
	.octa	0x84c8781484c8781484c8781484c87814
	.octa	0x8cc702088cc702088cc702088cc70208
	.octa	0x8cc702088cc702088cc702088cc70208
	.octa	0x90befffa90befffa90befffa90befffa
	.octa	0x90befffa90befffa90befffa90befffa
	.octa	0xa4506ceba4506ceba4506ceba4506ceb
	.octa	0xa4506ceba4506ceba4506ceba4506ceb
	.octa	0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
	.octa	0xbef9a3f7bef9a3f7bef9a3f7bef9a3f7
	.octa	0xc67178f2c67178f2c67178f2c67178f2
	.octa	0xc67178f2c67178f2c67178f2c67178f2
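
/*
 * K256_8 above is the scalar K256 table below with each round constant
 * broadcast across all eight 32-bit lanes, so one 32-byte vector load
 * per round fetches the constant for every lane at once.  A sketch of
 * how such a table could be generated (hypothetical helper, not part
 * of this file):
 *
 *	#include <stdint.h>
 *
 *	static void make_k256_8(const uint32_t k256[64],
 *				uint32_t k256_8[64][8])
 *	{
 *		for (int i = 0; i < 64; i++)
 *			for (int lane = 0; lane < 8; lane++)
 *				k256_8[i][lane] = k256[i];
 *	}
 */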

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
.octa 0x0c0d0e0f08090a0b0405060700010203
.octa 0x0c0d0e0f08090a0b0405060700010203
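
/*
 * vpshufb with this mask reverses the four bytes within each 32-bit
 * lane, converting SHA-256's big-endian message words to the CPU's
 * little-endian order.  Scalar C equivalent (reference only):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t byte_flip(uint32_t x)	// bswap each 32-bit word
 *	{
 *		return (x >> 24) | ((x >> 8) & 0x0000ff00u) |
 *		       ((x << 8) & 0x00ff0000u) | (x << 24);
 *	}
 */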
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
.global K256
K256:
	.int	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.int	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.int	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.int	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.int	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.int	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.int	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.int	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.int	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.int	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.int	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.int	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.int	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.int	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.int	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.int	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
599