1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#ifdef HITLS_CRYPTO_SM4
18
19#include "crypt_arm.h"
20.arch	armv8-a+crypto
21
22rk0       .req  v12
23rk1       .req  v13
24rka       .req  v14
25rkb       .req  v15
26rk2       .req  v20
27rkc       .req  v21
28
29vtmp0     .req  v0
30vtmp1     .req  v1
31vtmp2     .req  v2
32vtmp3     .req  v3
33
34vtmp4     .req  v24
35vtmp5     .req  v25
36vtmp6     .req  v22
37vtmp7     .req  v23
38
39data0     .req  v4
40data1     .req  v5
41data2     .req  v6
42data3     .req  v7
43
44datax0     .req  v8
45datax1     .req  v9
46datax2     .req  v10
47datax3     .req  v11
48
49vtmpx0     .req  v12
50vtmpx1     .req  v13
51vtmpx2     .req  v14
52vtmpx3     .req  v15
53
54data10     .req  v16
55data11     .req  v17
56data12     .req  v18
57data13     .req  v19
58
59MaskV     .req  v26
60TAHMatV   .req  v27
61TALMatV   .req  v28
62ATAHMatV  .req  v29
63ATALMatV  .req  v30
64ANDMaskV  .req  v31
65
66MaskQ      .req q26
67TAHMatQ    .req q27
68TALMatQ    .req q28
69ATAHMatQ   .req q29
70ATALMatQ   .req q30
71ANDMaskQ   .req q31
72vtmp5q     .req q25
73vtmp6q     .req q22
74vtmp7q     .req q23
75
76inp      .req   x0
77outp     .req   x1
78blocks   .req   w2
79rks      .req   x3
80
81wtmp0   .req    w7
82wtmp1   .req    w8
83wtmp2   .req    w9
84
85ptr     .req    x10
86counter .req    w11
87
88word0   .req    w12
89word1   .req    w13
90word2   .req    w14
91word3   .req    w15
92
93xword1  .req    x13
94tbox0   .req    x19
95tbox1   .req    x20
96tbox2   .req    x21
97tbox3   .req    x22
98
99len     .req   x2
100ivp     .req   x4
101ctr     .req   w5
102ivec    .req   v3
103ivec1   .req   v15
104
105.section .rodata
106.align 4
107.Ltbox1:
108.word	0xd55b5b8e, 0x924242d0, 0xeaa7a74d, 0xfdfbfb06, 0xcf3333fc, 0xe2878765, 0x3df4f4c9, 0xb5dede6b, 0x1658584e
109.word	0xb4dada6e, 0x14505044, 0xc10b0bca, 0x28a0a088, 0xf8efef17, 0x2cb0b09c, 0x05141411, 0x2bacac87, 0x669d9dfb
110.word	0x986a6af2, 0x77d9d9ae, 0x2aa8a882, 0xbcfafa46, 0x04101014, 0xc00f0fcf, 0xa8aaaa02, 0x45111154, 0x134c4c5f
111.word	0x269898be, 0x4825256d, 0x841a1a9e, 0x0618181e, 0x9b6666fd, 0x9e7272ec, 0x4309094a, 0x51414110, 0xf7d3d324
112.word	0x934646d5, 0xecbfbf53, 0x9a6262f8, 0x7be9e992, 0x33ccccff, 0x55515104, 0x0b2c2c27, 0x420d0d4f, 0xeeb7b759
113.word	0xcc3f3ff3, 0xaeb2b21c, 0x638989ea, 0xe7939374, 0xb1cece7f, 0x1c70706c, 0xaba6a60d, 0xca2727ed, 0x08202028
114.word	0xeba3a348, 0x975656c1, 0x82020280, 0xdc7f7fa3, 0x965252c4, 0xf9ebeb12, 0x74d5d5a1, 0x8d3e3eb3, 0x3ffcfcc3
115.word	0xa49a9a3e, 0x461d1d5b, 0x071c1c1b, 0xa59e9e3b, 0xfff3f30c, 0xf0cfcf3f, 0x72cdcdbf, 0x175c5c4b, 0xb8eaea52
116.word	0x810e0e8f, 0x5865653d, 0x3cf0f0cc, 0x1964647d, 0xe59b9b7e, 0x87161691, 0x4e3d3d73, 0xaaa2a208, 0x69a1a1c8
117.word	0x6aadadc7, 0x83060685, 0xb0caca7a, 0x70c5c5b5, 0x659191f4, 0xd96b6bb2, 0x892e2ea7, 0xfbe3e318, 0xe8afaf47
118.word	0x0f3c3c33, 0x4a2d2d67, 0x71c1c1b0, 0x5759590e, 0x9f7676e9, 0x35d4d4e1, 0x1e787866, 0x249090b4, 0x0e383836
119.word	0x5f797926, 0x628d8def, 0x59616138, 0xd2474795, 0xa08a8a2a, 0x259494b1, 0x228888aa, 0x7df1f18c, 0x3bececd7
120.word	0x01040405, 0x218484a5, 0x79e1e198, 0x851e1e9b, 0xd7535384, 0x00000000, 0x4719195e, 0x565d5d0b, 0x9d7e7ee3
121.word	0xd04f4f9f, 0x279c9cbb, 0x5349491a, 0x4d31317c, 0x36d8d8ee, 0x0208080a, 0xe49f9f7b, 0xa2828220, 0xc71313d4
122.word	0xcb2323e8, 0x9c7a7ae6, 0xe9abab42, 0xbdfefe43, 0x882a2aa2, 0xd14b4b9a, 0x41010140, 0xc41f1fdb, 0x38e0e0d8
123.word	0xb7d6d661, 0xa18e8e2f, 0xf4dfdf2b, 0xf1cbcb3a, 0xcd3b3bf6, 0xfae7e71d, 0x608585e5, 0x15545441, 0xa3868625
124.word	0xe3838360, 0xacbaba16, 0x5c757529, 0xa6929234, 0x996e6ef7, 0x34d0d0e4, 0x1a686872, 0x54555501, 0xafb6b619
125.word	0x914e4edf, 0x32c8c8fa, 0x30c0c0f0, 0xf6d7d721, 0x8e3232bc, 0xb3c6c675, 0xe08f8f6f, 0x1d747469, 0xf5dbdb2e
126.word	0xe18b8b6a, 0x2eb8b896, 0x800a0a8a, 0x679999fe, 0xc92b2be2, 0x618181e0, 0xc30303c0, 0x29a4a48d, 0x238c8caf
127.word	0xa9aeae07, 0x0d343439, 0x524d4d1f, 0x4f393976, 0x6ebdbdd3, 0xd6575781, 0xd86f6fb7, 0x37dcdceb, 0x44151551
128.word	0xdd7b7ba6, 0xfef7f709, 0x8c3a3ab6, 0x2fbcbc93, 0x030c0c0f, 0xfcffff03, 0x6ba9a9c2, 0x73c9c9ba, 0x6cb5b5d9
129.word	0x6db1b1dc, 0x5a6d6d37, 0x50454515, 0x8f3636b9, 0x1b6c6c77, 0xadbebe13, 0x904a4ada, 0xb9eeee57, 0xde7777a9
130.word	0xbef2f24c, 0x7efdfd83, 0x11444455, 0xda6767bd, 0x5d71712c, 0x40050545, 0x1f7c7c63, 0x10404050, 0x5b696932
131.word	0xdb6363b8, 0x0a282822, 0xc20707c5, 0x31c4c4f5, 0x8a2222a8, 0xa7969631, 0xce3737f9, 0x7aeded97, 0xbff6f649
132.word	0x2db4b499, 0x75d1d1a4, 0xd3434390, 0x1248485a, 0xbae2e258, 0xe6979771, 0xb6d2d264, 0xb2c2c270, 0x8b2626ad
133.word	0x68a5a5cd, 0x955e5ecb, 0x4b292962, 0x0c30303c, 0x945a5ace, 0x76ddddab, 0x7ff9f986, 0x649595f1, 0xbbe6e65d
134.word	0xf2c7c735, 0x0924242d, 0xc61717d1, 0x6fb9b9d6, 0xc51b1bde, 0x86121294, 0x18606078, 0xf3c3c330, 0x7cf5f589
135.word	0xefb3b35c, 0x3ae8e8d2, 0xdf7373ac, 0x4c353579, 0x208080a0, 0x78e5e59d, 0xedbbbb56, 0x5e7d7d23, 0x3ef8f8c6
136.word	0xd45f5f8b, 0xc82f2fe7, 0x39e4e4dd, 0x49212168
137
138.Ltbox2:
139.word	0x5b5b8ed5, 0x4242d092, 0xa7a74dea, 0xfbfb06fd, 0x3333fccf, 0x878765e2, 0xf4f4c93d, 0xdede6bb5, 0x58584e16
140.word	0xdada6eb4, 0x50504414, 0x0b0bcac1, 0xa0a08828, 0xefef17f8, 0xb0b09c2c, 0x14141105, 0xacac872b, 0x9d9dfb66
141.word	0x6a6af298, 0xd9d9ae77, 0xa8a8822a, 0xfafa46bc, 0x10101404, 0x0f0fcfc0, 0xaaaa02a8, 0x11115445, 0x4c4c5f13
142.word	0x9898be26, 0x25256d48, 0x1a1a9e84, 0x18181e06, 0x6666fd9b, 0x7272ec9e, 0x09094a43, 0x41411051, 0xd3d324f7
143.word	0x4646d593, 0xbfbf53ec, 0x6262f89a, 0xe9e9927b, 0xccccff33, 0x51510455, 0x2c2c270b, 0x0d0d4f42, 0xb7b759ee
144.word	0x3f3ff3cc, 0xb2b21cae, 0x8989ea63, 0x939374e7, 0xcece7fb1, 0x70706c1c, 0xa6a60dab, 0x2727edca, 0x20202808
145.word	0xa3a348eb, 0x5656c197, 0x02028082, 0x7f7fa3dc, 0x5252c496, 0xebeb12f9, 0xd5d5a174, 0x3e3eb38d, 0xfcfcc33f
146.word	0x9a9a3ea4, 0x1d1d5b46, 0x1c1c1b07, 0x9e9e3ba5, 0xf3f30cff, 0xcfcf3ff0, 0xcdcdbf72, 0x5c5c4b17, 0xeaea52b8
147.word	0x0e0e8f81, 0x65653d58, 0xf0f0cc3c, 0x64647d19, 0x9b9b7ee5, 0x16169187, 0x3d3d734e, 0xa2a208aa, 0xa1a1c869
148.word	0xadadc76a, 0x06068583, 0xcaca7ab0, 0xc5c5b570, 0x9191f465, 0x6b6bb2d9, 0x2e2ea789, 0xe3e318fb, 0xafaf47e8
149.word	0x3c3c330f, 0x2d2d674a, 0xc1c1b071, 0x59590e57, 0x7676e99f, 0xd4d4e135, 0x7878661e, 0x9090b424, 0x3838360e
150.word	0x7979265f, 0x8d8def62, 0x61613859, 0x474795d2, 0x8a8a2aa0, 0x9494b125, 0x8888aa22, 0xf1f18c7d, 0xececd73b
151.word	0x04040501, 0x8484a521, 0xe1e19879, 0x1e1e9b85, 0x535384d7, 0x00000000, 0x19195e47, 0x5d5d0b56, 0x7e7ee39d
152.word	0x4f4f9fd0, 0x9c9cbb27, 0x49491a53, 0x31317c4d, 0xd8d8ee36, 0x08080a02, 0x9f9f7be4, 0x828220a2, 0x1313d4c7
153.word	0x2323e8cb, 0x7a7ae69c, 0xabab42e9, 0xfefe43bd, 0x2a2aa288, 0x4b4b9ad1, 0x01014041, 0x1f1fdbc4, 0xe0e0d838
154.word	0xd6d661b7, 0x8e8e2fa1, 0xdfdf2bf4, 0xcbcb3af1, 0x3b3bf6cd, 0xe7e71dfa, 0x8585e560, 0x54544115, 0x868625a3
155.word	0x838360e3, 0xbaba16ac, 0x7575295c, 0x929234a6, 0x6e6ef799, 0xd0d0e434, 0x6868721a, 0x55550154, 0xb6b619af
156.word	0x4e4edf91, 0xc8c8fa32, 0xc0c0f030, 0xd7d721f6, 0x3232bc8e, 0xc6c675b3, 0x8f8f6fe0, 0x7474691d, 0xdbdb2ef5
157.word	0x8b8b6ae1, 0xb8b8962e, 0x0a0a8a80, 0x9999fe67, 0x2b2be2c9, 0x8181e061, 0x0303c0c3, 0xa4a48d29, 0x8c8caf23
158.word	0xaeae07a9, 0x3434390d, 0x4d4d1f52, 0x3939764f, 0xbdbdd36e, 0x575781d6, 0x6f6fb7d8, 0xdcdceb37, 0x15155144
159.word	0x7b7ba6dd, 0xf7f709fe, 0x3a3ab68c, 0xbcbc932f, 0x0c0c0f03, 0xffff03fc, 0xa9a9c26b, 0xc9c9ba73, 0xb5b5d96c
160.word	0xb1b1dc6d, 0x6d6d375a, 0x45451550, 0x3636b98f, 0x6c6c771b, 0xbebe13ad, 0x4a4ada90, 0xeeee57b9, 0x7777a9de
161.word	0xf2f24cbe, 0xfdfd837e, 0x44445511, 0x6767bdda, 0x71712c5d, 0x05054540, 0x7c7c631f, 0x40405010, 0x6969325b
162.word	0x6363b8db, 0x2828220a, 0x0707c5c2, 0xc4c4f531, 0x2222a88a, 0x969631a7, 0x3737f9ce, 0xeded977a, 0xf6f649bf
163.word	0xb4b4992d, 0xd1d1a475, 0x434390d3, 0x48485a12, 0xe2e258ba, 0x979771e6, 0xd2d264b6, 0xc2c270b2, 0x2626ad8b
164.word	0xa5a5cd68, 0x5e5ecb95, 0x2929624b, 0x30303c0c, 0x5a5ace94, 0xddddab76, 0xf9f9867f, 0x9595f164, 0xe6e65dbb
165.word	0xc7c735f2, 0x24242d09, 0x1717d1c6, 0xb9b9d66f, 0x1b1bdec5, 0x12129486, 0x60607818, 0xc3c330f3, 0xf5f5897c
166.word	0xb3b35cef, 0xe8e8d23a, 0x7373acdf, 0x3535794c, 0x8080a020, 0xe5e59d78, 0xbbbb56ed, 0x7d7d235e, 0xf8f8c63e
167.word	0x5f5f8bd4, 0x2f2fe7c8, 0xe4e4dd39, 0x21216849
168
169.Ltbox3:
170.word	0x5b8ed55b, 0x42d09242, 0xa74deaa7, 0xfb06fdfb, 0x33fccf33, 0x8765e287, 0xf4c93df4, 0xde6bb5de, 0x584e1658
171.word	0xda6eb4da, 0x50441450, 0x0bcac10b, 0xa08828a0, 0xef17f8ef, 0xb09c2cb0, 0x14110514, 0xac872bac, 0x9dfb669d
172.word	0x6af2986a, 0xd9ae77d9, 0xa8822aa8, 0xfa46bcfa, 0x10140410, 0x0fcfc00f, 0xaa02a8aa, 0x11544511, 0x4c5f134c
173.word	0x98be2698, 0x256d4825, 0x1a9e841a, 0x181e0618, 0x66fd9b66, 0x72ec9e72, 0x094a4309, 0x41105141, 0xd324f7d3
174.word	0x46d59346, 0xbf53ecbf, 0x62f89a62, 0xe9927be9, 0xccff33cc, 0x51045551, 0x2c270b2c, 0x0d4f420d, 0xb759eeb7
175.word	0x3ff3cc3f, 0xb21caeb2, 0x89ea6389, 0x9374e793, 0xce7fb1ce, 0x706c1c70, 0xa60daba6, 0x27edca27, 0x20280820
176.word	0xa348eba3, 0x56c19756, 0x02808202, 0x7fa3dc7f, 0x52c49652, 0xeb12f9eb, 0xd5a174d5, 0x3eb38d3e, 0xfcc33ffc
177.word	0x9a3ea49a, 0x1d5b461d, 0x1c1b071c, 0x9e3ba59e, 0xf30cfff3, 0xcf3ff0cf, 0xcdbf72cd, 0x5c4b175c, 0xea52b8ea
178.word	0x0e8f810e, 0x653d5865, 0xf0cc3cf0, 0x647d1964, 0x9b7ee59b, 0x16918716, 0x3d734e3d, 0xa208aaa2, 0xa1c869a1
179.word	0xadc76aad, 0x06858306, 0xca7ab0ca, 0xc5b570c5, 0x91f46591, 0x6bb2d96b, 0x2ea7892e, 0xe318fbe3, 0xaf47e8af
180.word	0x3c330f3c, 0x2d674a2d, 0xc1b071c1, 0x590e5759, 0x76e99f76, 0xd4e135d4, 0x78661e78, 0x90b42490, 0x38360e38
181.word	0x79265f79, 0x8def628d, 0x61385961, 0x4795d247, 0x8a2aa08a, 0x94b12594, 0x88aa2288, 0xf18c7df1, 0xecd73bec
182.word	0x04050104, 0x84a52184, 0xe19879e1, 0x1e9b851e, 0x5384d753, 0x00000000, 0x195e4719, 0x5d0b565d, 0x7ee39d7e
183.word	0x4f9fd04f, 0x9cbb279c, 0x491a5349, 0x317c4d31, 0xd8ee36d8, 0x080a0208, 0x9f7be49f, 0x8220a282, 0x13d4c713
184.word	0x23e8cb23, 0x7ae69c7a, 0xab42e9ab, 0xfe43bdfe, 0x2aa2882a, 0x4b9ad14b, 0x01404101, 0x1fdbc41f, 0xe0d838e0
185.word	0xd661b7d6, 0x8e2fa18e, 0xdf2bf4df, 0xcb3af1cb, 0x3bf6cd3b, 0xe71dfae7, 0x85e56085, 0x54411554, 0x8625a386
186.word	0x8360e383, 0xba16acba, 0x75295c75, 0x9234a692, 0x6ef7996e, 0xd0e434d0, 0x68721a68, 0x55015455, 0xb619afb6
187.word	0x4edf914e, 0xc8fa32c8, 0xc0f030c0, 0xd721f6d7, 0x32bc8e32, 0xc675b3c6, 0x8f6fe08f, 0x74691d74, 0xdb2ef5db
188.word	0x8b6ae18b, 0xb8962eb8, 0x0a8a800a, 0x99fe6799, 0x2be2c92b, 0x81e06181, 0x03c0c303, 0xa48d29a4, 0x8caf238c
189.word	0xae07a9ae, 0x34390d34, 0x4d1f524d, 0x39764f39, 0xbdd36ebd, 0x5781d657, 0x6fb7d86f, 0xdceb37dc, 0x15514415
190.word	0x7ba6dd7b, 0xf709fef7, 0x3ab68c3a, 0xbc932fbc, 0x0c0f030c, 0xff03fcff, 0xa9c26ba9, 0xc9ba73c9, 0xb5d96cb5
191.word	0xb1dc6db1, 0x6d375a6d, 0x45155045, 0x36b98f36, 0x6c771b6c, 0xbe13adbe, 0x4ada904a, 0xee57b9ee, 0x77a9de77
192.word	0xf24cbef2, 0xfd837efd, 0x44551144, 0x67bdda67, 0x712c5d71, 0x05454005, 0x7c631f7c, 0x40501040, 0x69325b69
193.word	0x63b8db63, 0x28220a28, 0x07c5c207, 0xc4f531c4, 0x22a88a22, 0x9631a796, 0x37f9ce37, 0xed977aed, 0xf649bff6
194.word	0xb4992db4, 0xd1a475d1, 0x4390d343, 0x485a1248, 0xe258bae2, 0x9771e697, 0xd264b6d2, 0xc270b2c2, 0x26ad8b26
195.word	0xa5cd68a5, 0x5ecb955e, 0x29624b29, 0x303c0c30, 0x5ace945a, 0xddab76dd, 0xf9867ff9, 0x95f16495, 0xe65dbbe6
196.word	0xc735f2c7, 0x242d0924, 0x17d1c617, 0xb9d66fb9, 0x1bdec51b, 0x12948612, 0x60781860, 0xc330f3c3, 0xf5897cf5
197.word	0xb35cefb3, 0xe8d23ae8, 0x73acdf73, 0x35794c35, 0x80a02080, 0xe59d78e5, 0xbb56edbb, 0x7d235e7d, 0xf8c63ef8
198.word	0x5f8bd45f, 0x2fe7c82f, 0xe4dd39e4, 0x21684921
199
200.Ltbox4:
201.word	0x8ed55b5b, 0xd0924242, 0x4deaa7a7, 0x06fdfbfb, 0xfccf3333, 0x65e28787, 0xc93df4f4, 0x6bb5dede, 0x4e165858
202.word	0x6eb4dada, 0x44145050, 0xcac10b0b, 0x8828a0a0, 0x17f8efef, 0x9c2cb0b0, 0x11051414, 0x872bacac, 0xfb669d9d
203.word	0xf2986a6a, 0xae77d9d9, 0x822aa8a8, 0x46bcfafa, 0x14041010, 0xcfc00f0f, 0x02a8aaaa, 0x54451111, 0x5f134c4c
204.word	0xbe269898, 0x6d482525, 0x9e841a1a, 0x1e061818, 0xfd9b6666, 0xec9e7272, 0x4a430909, 0x10514141, 0x24f7d3d3
205.word	0xd5934646, 0x53ecbfbf, 0xf89a6262, 0x927be9e9, 0xff33cccc, 0x04555151, 0x270b2c2c, 0x4f420d0d, 0x59eeb7b7
206.word	0xf3cc3f3f, 0x1caeb2b2, 0xea638989, 0x74e79393, 0x7fb1cece, 0x6c1c7070, 0x0daba6a6, 0xedca2727, 0x28082020
207.word	0x48eba3a3, 0xc1975656, 0x80820202, 0xa3dc7f7f, 0xc4965252, 0x12f9ebeb, 0xa174d5d5, 0xb38d3e3e, 0xc33ffcfc
208.word	0x3ea49a9a, 0x5b461d1d, 0x1b071c1c, 0x3ba59e9e, 0x0cfff3f3, 0x3ff0cfcf, 0xbf72cdcd, 0x4b175c5c, 0x52b8eaea
209.word	0x8f810e0e, 0x3d586565, 0xcc3cf0f0, 0x7d196464, 0x7ee59b9b, 0x91871616, 0x734e3d3d, 0x08aaa2a2, 0xc869a1a1
210.word	0xc76aadad, 0x85830606, 0x7ab0caca, 0xb570c5c5, 0xf4659191, 0xb2d96b6b, 0xa7892e2e, 0x18fbe3e3, 0x47e8afaf
211.word	0x330f3c3c, 0x674a2d2d, 0xb071c1c1, 0x0e575959, 0xe99f7676, 0xe135d4d4, 0x661e7878, 0xb4249090, 0x360e3838
212.word	0x265f7979, 0xef628d8d, 0x38596161, 0x95d24747, 0x2aa08a8a, 0xb1259494, 0xaa228888, 0x8c7df1f1, 0xd73becec
213.word	0x05010404, 0xa5218484, 0x9879e1e1, 0x9b851e1e, 0x84d75353, 0x00000000, 0x5e471919, 0x0b565d5d, 0xe39d7e7e
214.word	0x9fd04f4f, 0xbb279c9c, 0x1a534949, 0x7c4d3131, 0xee36d8d8, 0x0a020808, 0x7be49f9f, 0x20a28282, 0xd4c71313
215.word	0xe8cb2323, 0xe69c7a7a, 0x42e9abab, 0x43bdfefe, 0xa2882a2a, 0x9ad14b4b, 0x40410101, 0xdbc41f1f, 0xd838e0e0
216.word	0x61b7d6d6, 0x2fa18e8e, 0x2bf4dfdf, 0x3af1cbcb, 0xf6cd3b3b, 0x1dfae7e7, 0xe5608585, 0x41155454, 0x25a38686
217.word	0x60e38383, 0x16acbaba, 0x295c7575, 0x34a69292, 0xf7996e6e, 0xe434d0d0, 0x721a6868, 0x01545555, 0x19afb6b6
218.word	0xdf914e4e, 0xfa32c8c8, 0xf030c0c0, 0x21f6d7d7, 0xbc8e3232, 0x75b3c6c6, 0x6fe08f8f, 0x691d7474, 0x2ef5dbdb
219.word	0x6ae18b8b, 0x962eb8b8, 0x8a800a0a, 0xfe679999, 0xe2c92b2b, 0xe0618181, 0xc0c30303, 0x8d29a4a4, 0xaf238c8c
220.word	0x07a9aeae, 0x390d3434, 0x1f524d4d, 0x764f3939, 0xd36ebdbd, 0x81d65757, 0xb7d86f6f, 0xeb37dcdc, 0x51441515
221.word	0xa6dd7b7b, 0x09fef7f7, 0xb68c3a3a, 0x932fbcbc, 0x0f030c0c, 0x03fcffff, 0xc26ba9a9, 0xba73c9c9, 0xd96cb5b5
222.word	0xdc6db1b1, 0x375a6d6d, 0x15504545, 0xb98f3636, 0x771b6c6c, 0x13adbebe, 0xda904a4a, 0x57b9eeee, 0xa9de7777
223.word	0x4cbef2f2, 0x837efdfd, 0x55114444, 0xbdda6767, 0x2c5d7171, 0x45400505, 0x631f7c7c, 0x50104040, 0x325b6969
224.word	0xb8db6363, 0x220a2828, 0xc5c20707, 0xf531c4c4, 0xa88a2222, 0x31a79696, 0xf9ce3737, 0x977aeded, 0x49bff6f6
225.word	0x992db4b4, 0xa475d1d1, 0x90d34343, 0x5a124848, 0x58bae2e2, 0x71e69797, 0x64b6d2d2, 0x70b2c2c2, 0xad8b2626
226.word	0xcd68a5a5, 0xcb955e5e, 0x624b2929, 0x3c0c3030, 0xce945a5a, 0xab76dddd, 0x867ff9f9, 0xf1649595, 0x5dbbe6e6
227.word	0x35f2c7c7, 0x2d092424, 0xd1c61717, 0xd66fb9b9, 0xdec51b1b, 0x94861212, 0x78186060, 0x30f3c3c3, 0x897cf5f5
228.word	0x5cefb3b3, 0xd23ae8e8, 0xacdf7373, 0x794c3535, 0xa0208080, 0x9d78e5e5, 0x56edbbbb, 0x235e7d7d, 0xc63ef8f8
229.word	0x8bd45f5f, 0xe7c82f2f, 0xdd39e4e4, 0x68492121
230
231#ifdef HITLS_BIG_ENDIAN
232.Lxts_magic:
233	.quad	0x0101010101010101,0x0101010101010187
234
235.Lsbox_magic:
236    .quad 0x0306090c0f020508,0x0b0e0104070a0d00
237    .quad 0x22581a6002783a40,0x62185a2042387a00
238    .quad 0xc10bb67c4a803df7,0x15df62a89e54e923
239    .quad 0x1407c6d56c7fbead,0xb9aa6b78c1d21300
240    .quad 0xe383c1a1fe9edcbc,0x6404462679195b3b
241    .quad 0x0E0D0C0F0A09080B,0x0605040702010003
242    .quad 0x0D0C0F0E09080B0A,0x0504070601000302
243    .quad 0x0C0F0E0D080B0A09,0x0407060500030201
244#else
245.Lxts_magic:
246	.quad	0x0101010101010187,0x0101010101010101
247
248.Lsbox_magic:
249    .quad 0x0b0e0104070a0d00,0x0306090c0f020508
250    .quad 0x62185a2042387a00,0x22581a6002783a40
251    .quad 0x15df62a89e54e923,0xc10bb67c4a803df7
252    .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
253    .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
254    .quad 0x0605040702010003,0x0E0D0C0F0A09080B
255    .quad 0x0504070601000302,0x0D0C0F0E09080B0A
256    .quad 0x0407060500030201,0x0C0F0E0D080B0A09
257#endif
258
259.macro LoadSbox
260	adrp	x15,.Lsbox_magic
261	add	x15,x15,:lo12:.Lsbox_magic
262    ldr MaskQ,      [x15]
263    ldr TAHMatQ,    [x15, #16]
264    ldr TALMatQ,    [x15, #32]
265    ldr ATAHMatQ,   [x15, #48]
266    ldr ATALMatQ,   [x15, #64]
267    ldr vtmp5q,     [x15, #80]
268    ldr vtmp6q,     [x15, #96]
269    ldr vtmp7q,     [x15, #112]
270.endm
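# The .Lsbox_magic constants loaded above: MaskQ is the inverse-ShiftRows byte
# permutation that cancels the ShiftRows step inside AESE; TAH/TAL and
# ATAH/ATAL are high/low-nibble lookup tables for the affine maps that let
# AESE's SubBytes compute the SM4 S-box; vtmp5q/vtmp6q/vtmp7q are byte
# shuffles used by SboxThree to rotate each 32-bit word left by 8/16/24 bits.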
271
272.macro round x1, x2, x3, x4, rk
273	eor	word0,\x2, \x3
274	eor	word0, word0, \rk
275	eor	word0, word0, \x4
276
277	and	word1, word0, #0xff
278	ldr	word1, [tbox0,xword1,lsl #2]
279	eor	\x1, word1, \x1
280
281	ubfx word1, word0,#8,#8
282	ldr	word1, [tbox1, xword1, lsl #2]
283	eor	\x1, word1, \x1
284
285	ubfx word1, word0, #16, #8
286	ldr	word1,[tbox2, xword1, lsl #2]
287	eor	\x1, word1, \x1
288
289	lsr	word1, word0, #24
290	ldr	word1, [tbox3, xword1, lsl #2]
291	eor	\x1, word1, \x1
292.endm
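# The round macro above is the scalar T-table form of one SM4 round:
# x1 ^= T(x2 ^ x3 ^ x4 ^ rk), where T is looked up byte-wise in the four
# 1KB tables .Ltbox1..Ltbox4.  A rough C sketch (names hypothetical, not
# part of this file):
/*
 *  extern const uint32_t tbox1[256], tbox2[256], tbox3[256], tbox4[256];
 *
 *  static uint32_t sm4_t(uint32_t x)
 *  {
 *      return tbox1[x & 0xff] ^ tbox2[(x >> 8) & 0xff] ^
 *             tbox3[(x >> 16) & 0xff] ^ tbox4[x >> 24];
 *  }
 *  // round(x1, x2, x3, x4, rk):  x1 ^= sm4_t(x2 ^ x3 ^ x4 ^ rk)
 */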
293
294.macro EncRound4 offset1, offset2
295	ldp	word2, word3,[rks, \offset1]
296	round w8, w9, w10, w11, word2
297	round w9, w10, w11, w8, word3
298	ldp	word2, word3,[rks, \offset2]
299	round w10, w11, w8, w9, word2
300	round w11, w8, w9, w10, word3
301.endm
302
303.macro EncRound
304	EncRound4 0, 8
305	EncRound4 16, 24
306	EncRound4 32, 40
307	EncRound4 48, 56
308	EncRound4 64, 72
309	EncRound4 80, 88
310	EncRound4 96, 104
311	EncRound4 112, 120
312.endm
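# EncRound above expands to the full 32 SM4 rounds on w8..w11, reading the
# 32 round keys pairwise from rks (byte offsets 0..124).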
313
314.macro transpose dat0s, dat1s, dat2s, dat3s, dat0d, dat1d, dat2d, dat3d, vt0s, vt1s, vt2s, vt3s, vt0d, vt1d, vt2d, vt3d
315	zip1	\vt0s, \dat0s, \dat1s
316	zip2	\vt1s, \dat0s, \dat1s
317	zip1	\vt2s, \dat2s, \dat3s
318	zip2	\vt3s, \dat2s, \dat3s
319	zip1	\dat0d, \vt0d, \vt2d
320	zip2	\dat1d, \vt0d, \vt2d
321	zip1	\dat2d, \vt1d, \vt3d
322	zip2	\dat3d, \vt1d, \vt3d
323.endm
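# transpose above performs a 4x4 transpose of 32-bit elements across four
# vectors with zip1/zip2; the *s arguments are the .4s views and the *d
# arguments the .2d views of the same registers.  It is used, e.g. in the CBC
# decrypt path, to move from the word-major layout the 4-lane SM4 code works
# in back to the block-major layout of ld1/st1 before XORing the chain.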
324
325.macro Encrypt1blkNorevCtr
326	mov	w8,ivec.s[0]
327	mov	w9,ivec.s[1]
328	mov	w10,ivec.s[2]
329	mov	w11,ivec.s[3]
330	EncRound
331	mov	ivec.s[0],w11
332	mov	ivec.s[1],w10
333	mov	ivec.s[2],w9
334	mov	ivec.s[3],w8
335#ifndef HITLS_BIG_ENDIAN
336	rev32 v3.16b,v3.16b
337#endif
338.endm
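# Encrypt1blkNorevCtr encrypts the single counter block held in ivec (v3) in
# place: its words are assumed to already be in host word order (the CTR entry
# code rev32s the IV on little-endian), the 32 rounds run in GPRs, and the
# result is written back word-reversed (SM4 reverse transform) and
# byte-swapped for little-endian output.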
339
340# matrix multiplication Mat*x = (higherMat * (x >> 4)) ^ (lowerMat * (x & 0x0f)), one tbl lookup per nibble
341.macro MulMatrix x, higherMat, lowerMat, tmp
342	ushr	\tmp, \x, 4
343	and		\x, \x, ANDMaskV.16b
344	tbl		\x, {\lowerMat}, \x
345	tbl		\tmp, {\higherMat}, \tmp
346	eor		\x, \x, \tmp
347.endm
348
349# matrix multiplication as in MulMatrix above, but the result is written to \out instead of back into \x
350.macro MulMatrixOut x, higherMat, lowerMat, tmp, out
351	ushr	\tmp, \x, 4
352	and		\x, \x, ANDMaskV.16b
353	tbl		\x, {\lowerMat}, \x
354	tbl		\tmp, {\higherMat}, \tmp
355	eor		\out, \x, \tmp
356.endm
357
358# Sbox operation for 4 lanes of 32-bit words
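# Computes T(X) = L(S(X)) on four 32-bit lanes (\dat and \dat2 are the .16b
# and .4s views of the same register at the call sites):
#   S(X): pre-permute with MaskV (inverse ShiftRows), apply the TAL/TAH
#   affine map, run AESE with an all-zero round key (SubBytes + ShiftRows),
#   then map back with ATAL/ATAH.
#   L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24), built from
#   the ushr/sli rotate pairs.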
359.macro Sbox dat, dat2
360	movi ANDMaskV.16b, #0x0f
361	// optimize Sbox using AESE instruction
362	tbl	v0.16b, {\dat}, MaskV.16b
363	MulMatrix v0.16b, TAHMatV.16b, TALMatV.16b, v24.16b
364
365	eor v1.16b, v1.16b, v1.16b
366	aese v0.16b, v1.16b
367
368	MulMatrix v0.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b
369
370	mov	\dat, v0.16b
371
372	// linear transformation
373	ushr	v0.4s, \dat2,32-2
374	ushr	v1.4s, \dat2,32-10
375	ushr	v2.4s, \dat2,32-18
376	ushr	v3.4s, \dat2,32-24
377	sli	v0.4s, \dat2,2
378	sli	v1.4s, \dat2,10
379	sli	v2.4s, \dat2,18
380	sli	v3.4s, \dat2,24
381	eor	v24.16b, v0.16b, \dat
382	eor	v24.16b, v24.16b, v1.16b
383	eor	\dat, v2.16b, v3.16b
384	eor	\dat, \dat, v24.16b
385.endm
386
387# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
388.macro Sm44blks kptr
389	ldp	wtmp0, wtmp1,[\kptr],8
390	dup	rk0.4s, wtmp0
391	dup	rk1.4s, wtmp1
392
393	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
394	eor	rka.16b, v6.16b, v7.16b
395	eor	rk0.16b, v5.16b, rk0.16b
396	eor	rk0.16b, rka.16b, rk0.16b
397
398	Sbox rk0.16b, rk0.4s
399
400	eor	v4.16b, v4.16b, rk0.16b
401
402	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
403	eor	rka.16b, rka.16b, v4.16b
404	eor	rk1.16b, rka.16b, rk1.16b
405
406	Sbox rk1.16b, rk1.4s
407
408	ldp	wtmp0, wtmp1,[\kptr],8
409	eor	v5.16b,v5.16b, rk1.16b
410
411	dup	rk0.4s, wtmp0
412	dup	rk1.4s, wtmp1
413
414	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
415	eor	rka.16b, v4.16b, v5.16b
416	eor	rk0.16b, v7.16b, rk0.16b
417	eor	rk0.16b, rka.16b, rk0.16b
418
419	Sbox rk0.16b, rk0.4s
420
421	eor	v6.16b, v6.16b, rk0.16b
422
423	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
424	eor	rka.16b, rka.16b, v6.16b
425	eor	rk1.16b, rka.16b, rk1.16b
426
427	Sbox rk1.16b, rk1.4s
428
429	eor	v7.16b, v7.16b, rk1.16b
430.endm
431
432
433.macro Encrypt4blks
434	mov	ptr, rks
435	mov	counter,#8
43610:
437	Sm44blks ptr
438
439	subs counter, counter,#1
440	b.ne 10b
441#ifndef HITLS_BIG_ENDIAN
442	rev32	v3.16b,v4.16b
443	rev32	v2.16b,v5.16b
444	rev32	v1.16b,v6.16b
445	rev32	v0.16b,v7.16b
446#else
447	mov	    v3.16b,v4.16b
448	mov	    v2.16b,v5.16b
449	mov	    v1.16b,v6.16b
450	mov	    v0.16b,v7.16b
451#endif
452.endm
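# In Encrypt4blks the v3<-v4 ... v0<-v7 renaming applies the final SM4
# reverse transform, so the following st4 stores the words in ciphertext
# order; rev32 additionally restores big-endian byte order within each word
# on little-endian hosts.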
453
454# Sbox operation for 8 lanes of words (two 4-lane groups in rk0 and rk1)
455.macro SboxDouble dat datx
456	movi ANDMaskV.16b, #0x0f
457	// optimize Sbox using AESE instruction
458	tbl	v0.16b, {rk0.16b}, MaskV.16b
459	tbl	v1.16b, {rk1.16b}, MaskV.16b
460
461	MulMatrix v0.16b, TAHMatV.16b, TALMatV.16b, v24.16b
462	MulMatrix v1.16b, TAHMatV.16b, TALMatV.16b, v24.16b
463	eor vtmp5.16b, vtmp5.16b, vtmp5.16b
464	aese v0.16b,vtmp5.16b
465	aese v1.16b,vtmp5.16b
466	MulMatrixOut v0.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b, rk0.16b
467	MulMatrixOut v1.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b, rk1.16b
468
469	// linear transformation
470	ushr	v0.4s,rk0.4s,32-2
471	ushr	vtmp5.4s,rk1.4s,32-2
472	ushr	v1.4s,rk0.4s,32-10
473	ushr	v2.4s,rk0.4s,32-18
474	ushr	v3.4s,rk0.4s,32-24
475	sli	v0.4s,rk0.4s,2
476	sli	vtmp5.4s,rk1.4s,2
477	sli	v1.4s,rk0.4s,10
478	sli	v2.4s,rk0.4s,18
479	sli	v3.4s,rk0.4s,24
480	eor	v24.16b,v0.16b,rk0.16b
481	eor	v24.16b,v24.16b,v1.16b
482	eor	rk0.16b,v2.16b,v3.16b
483	eor	rk0.16b,rk0.16b,v24.16b
484	ushr	v1.4s,rk1.4s,32-10
485	ushr	v2.4s,rk1.4s,32-18
486	ushr	v3.4s,rk1.4s,32-24
487	sli	v1.4s,rk1.4s,10
488	sli	v2.4s,rk1.4s,18
489	sli	v3.4s,rk1.4s,24
490	eor	v24.16b,vtmp5.16b,rk1.16b
491	eor	v24.16b,v24.16b,v1.16b
492	eor	rk1.16b,v2.16b,v3.16b
493	eor	rk1.16b,rk1.16b,v24.16b
494.endm
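# SboxDouble above is the same T(X) = L(S(X)) step as Sbox, applied to two
# 4-lane groups held in rk0 and rk1; the dat/datx parameters are not
# referenced in the body and the macro is invoked without arguments.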
495
496
497.macro SboxThree dat, datx, dat1
498	movi	ANDMaskV.16b, #0x0f
499	// optimize sbox using AESE instruction
500	tbl	v0.16b, {\dat}, MaskV.16b
501	tbl	v1.16b, {\datx}, MaskV.16b
502	tbl	v2.16b, {\dat1}, MaskV.16b
503	eor v3.16b, v3.16b, v3.16b
504
505	MulMatrix v0.16b, TAHMatV.16b, TALMatV.16b, v24.16b
506	MulMatrix v1.16b, TAHMatV.16b, TALMatV.16b, v24.16b
507
508	aese v0.16b, v3.16b
509
510	MulMatrix v2.16b, TAHMatV.16b, TALMatV.16b, v24.16b
511
512	aese v1.16b, v3.16b
513	aese v2.16b, v3.16b
514
515	MulMatrixOut v0.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b, \dat
516	MulMatrixOut v1.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b, \datx
517	MulMatrixOut v2.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b, \dat1
518
519	// linear transformation
520    tbl v0.16b, {\dat},  vtmp5.16b // rotate each word left by 8 (byte shuffle)
521    tbl v1.16b, {\datx}, vtmp5.16b
522    tbl v2.16b, {\dat1}, vtmp5.16b
523
524    tbl v3.16b,  {\dat},   v22.16b // rotate each word left by 16 (byte shuffle)
525    tbl v24.16b,    {\datx},  v22.16b
526    tbl ANDMaskV.16b, {\dat1},  v22.16b
527
528    eor v0.16b, v0.16b, \dat
529    eor v1.16b, v1.16b, \datx
530    eor v2.16b, v2.16b, \dat1
531
532	eor v0.16b, v0.16b, v3.16b
533    eor v1.16b, v1.16b, v24.16b
534    eor v2.16b, v2.16b, ANDMaskV.16b
535
536    shl v3.4s, v0.4s, #2  // with the sri below, rotate v0 left by 2: (B ^ B<<<8 ^ B<<<16) <<< 2 = B<<<2 ^ B<<<10 ^ B<<<18
537    sri v3.4s, v0.4s, #30
538    shl v24.4s, v1.4s, #2
539    sri v24.4s, v1.4s, #30
540    shl ANDMaskV.4s, v2.4s, #2
541    sri ANDMaskV.4s, v2.4s, #30
542
543    tbl v0.16b, {\dat},  v23.16b  // rotate each word left by 24 (byte shuffle)
544    tbl v1.16b, {\datx}, v23.16b
545    tbl v2.16b, {\dat1}, v23.16b
546
547	eor \dat, \dat, v3.16b
548	eor \datx, \datx, v24.16b
549	eor \dat1, \dat1, ANDMaskV.16b
550
551    eor \dat, v0.16b, \dat
552    eor \datx, v1.16b, \datx
553    eor \dat1, v2.16b, \dat1
554.endm
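# SboxThree above is the three-group variant of T(X) = L(S(X)).  The
# <<<8/<<<16/<<<24 rotations are done with the byte shuffles loaded into
# vtmp5/v22/v23 by LoadSbox, and the <<<2/<<<10/<<<18 terms come from one
# shl/sri rotate of their XOR, since
# (B ^ B<<<8 ^ B<<<16) <<< 2 = B<<<2 ^ B<<<10 ^ B<<<18.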
555
556.macro Sm48blks kptr
557	ldp	wtmp0, wtmp1,[\kptr],8
558
559	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
560	dup	rk0.4s, wtmp0
561	eor	rka.16b,v6.16b,v7.16b
562	eor	rkb.16b,v10.16b,v11.16b
563	eor	v0.16b,v5.16b,rk0.16b
564	eor	v1.16b,v9.16b,rk0.16b
565	eor	rk0.16b, rka.16b,v0.16b
566	eor	rk1.16b, rkb.16b,v1.16b
567	SboxDouble
568	eor	v4.16b, v4.16b, rk0.16b
569	eor	v8.16b,v8.16b, rk1.16b
570
571	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
572	dup	rk1.4s, wtmp1
573	eor	rka.16b,rka.16b,v4.16b
574	eor	rkb.16b,rkb.16b,v8.16b
575	eor	rk0.16b,rka.16b,rk1.16b
576	eor	rk1.16b,rkb.16b,rk1.16b
577	SboxDouble
578
579	ldp	wtmp0, wtmp1,[\kptr],8
580	eor	v5.16b,v5.16b,rk0.16b
581	eor	v9.16b,v9.16b,rk1.16b
582
583	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
584	dup	rk0.4s, wtmp0
585	eor	rka.16b,v4.16b,v5.16b
586	eor	rkb.16b,v8.16b,v9.16b
587	eor	v0.16b,v7.16b,rk0.16b
588	eor	v1.16b,v11.16b,rk0.16b
589	eor	rk0.16b,rka.16b,v0.16b
590	eor	rk1.16b,rkb.16b,v1.16b
591	SboxDouble
592
593	eor	v6.16b,v6.16b,rk0.16b
594	eor	v10.16b,v10.16b,rk1.16b
595
596	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
597	dup	rk1.4s, wtmp1
598	eor	rka.16b,rka.16b,v6.16b
599	eor	rkb.16b,rkb.16b,v10.16b
600	eor	rk0.16b,rka.16b,rk1.16b
601	eor	rk1.16b,rkb.16b,rk1.16b
602	SboxDouble
603
604	eor	v7.16b,v7.16b,rk0.16b
605	eor	v11.16b,v11.16b,rk1.16b
606.endm
607
608
609.macro Sm412blks kptr
610	ldp	wtmp0,wtmp1,[\kptr],8
611	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
612	dup	rk0.4s,wtmp0
613	eor	rka.16b,v6.16b,v7.16b
614	eor	rkb.16b,v10.16b,v11.16b
615	eor	rkc.16b,v18.16b,v19.16b
616	eor	v0.16b,v5.16b,rk0.16b
617	eor	v1.16b,v9.16b,rk0.16b
618	eor	v2.16b,v17.16b,rk0.16b
619	eor	rk0.16b,rka.16b,v0.16b
620	eor	rk1.16b,rkb.16b,v1.16b
621	eor	rk2.16b,rkc.16b,v2.16b
622
623	SboxThree rk0.16b, rk1.16b, rk2.16b
624
625	eor	v4.16b,v4.16b,rk0.16b
626	eor	v8.16b,v8.16b,rk1.16b
627	eor	v16.16b,v16.16b,rk2.16b
628
629	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
630	dup	rk1.4s,wtmp1
631	eor	rka.16b,rka.16b,v4.16b
632	eor	rkb.16b,rkb.16b,v8.16b
633	eor	rkc.16b,rkc.16b,v16.16b
634	eor	rk0.16b,rka.16b,rk1.16b
635	eor	rk2.16b,rkc.16b,rk1.16b
636	eor	rk1.16b,rkb.16b,rk1.16b
637
638	SboxThree rk0.16b, rk1.16b, rk2.16b
639
640	ldp	wtmp0,wtmp1,[\kptr],8
641	eor	v5.16b,v5.16b,rk0.16b
642	eor	v9.16b,v9.16b,rk1.16b
643	eor	v17.16b,v17.16b,rk2.16b
644
645	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
646	dup	rk0.4s,wtmp0
647	eor	rka.16b,v4.16b,v5.16b
648	eor	rkb.16b,v8.16b,v9.16b
649	eor	rkc.16b,v16.16b,v17.16b
650	eor	v0.16b,v7.16b,rk0.16b
651	eor	v1.16b,v11.16b,rk0.16b
652	eor	v2.16b,v19.16b,rk0.16b
653	eor	rk0.16b,rka.16b,v0.16b
654	eor	rk1.16b,rkb.16b,v1.16b
655	eor	rk2.16b,rkc.16b,v2.16b
656
657	SboxThree rk0.16b, rk1.16b, rk2.16b
658
659	eor	v6.16b,v6.16b,rk0.16b
660	eor	v10.16b,v10.16b,rk1.16b
661	eor	v18.16b,v18.16b,rk2.16b
662
663	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
664	dup	rk1.4s,wtmp1
665	eor	rka.16b,rka.16b,v6.16b
666	eor	rkb.16b,rkb.16b,v10.16b
667	eor	rkc.16b,rkc.16b,v18.16b
668	eor	rk0.16b,rka.16b,rk1.16b
669	eor	rk2.16b,rkc.16b,rk1.16b
670	eor	rk1.16b,rkb.16b,rk1.16b
671
672	SboxThree rk0.16b, rk1.16b, rk2.16b
673
674	eor	v7.16b,v7.16b,rk0.16b
675	eor	v11.16b,v11.16b,rk1.16b
676	eor	v19.16b,v19.16b,rk2.16b
677.endm
678
679
680.macro Encrypt8blks
681	mov	ptr, rks
682	mov	counter, #8
68310:
684	Sm48blks ptr
685
686	subs counter, counter,#1
687	b.ne	10b
688#ifndef HITLS_BIG_ENDIAN
689	rev32	v3.16b,v4.16b
690	rev32	v2.16b,v5.16b
691	rev32	v1.16b,v6.16b
692	rev32	v0.16b,v7.16b
693	rev32	v7.16b,v8.16b
694	rev32	v6.16b,v9.16b
695	rev32	v5.16b,v10.16b
696	rev32	v4.16b,v11.16b
697#else
698	mov 	v3.16b,v4.16b
699	mov 	v2.16b,v5.16b
700	mov 	v1.16b,v6.16b
701	mov 	v0.16b,v7.16b
702	mov 	v7.16b,v8.16b
703	mov 	v6.16b,v9.16b
704	mov 	v5.16b,v10.16b
705	mov 	v4.16b,v11.16b
706#endif
707.endm
708
709.macro Encrypt12blks
710	mov	ptr, rks
711	mov	counter, #8
71210:
713	Sm412blks ptr
714
715	subs	counter,counter,#1
716	b.ne	10b
717	// last reverse transform
718#ifndef HITLS_BIG_ENDIAN
719	rev32	v3.16b,v4.16b
720	rev32	v2.16b,v5.16b
721	rev32	v1.16b,v6.16b
722	rev32	v0.16b,v7.16b
723
724	rev32	v7.16b,v8.16b
725	rev32	v6.16b,v9.16b
726	rev32	v5.16b,v10.16b
727	rev32	v4.16b,v11.16b
728
729	rev32	v11.16b,v16.16b
730	rev32	v10.16b,v17.16b
731	rev32	v9.16b,v18.16b
732	rev32	v8.16b,v19.16b
733#else
734	mov	v3.16b,v4.16b
735	mov	v2.16b,v5.16b
736	mov	v1.16b,v6.16b
737	mov	v0.16b,v7.16b
738
739	mov	v7.16b,v8.16b
740	mov	v6.16b,v9.16b
741	mov	v5.16b,v10.16b
742	mov	v4.16b,v11.16b
743
744	mov	v11.16b,v16.16b
745	mov	v10.16b,v17.16b
746	mov	v9.16b,v18.16b
747	mov	v8.16b,v19.16b
748#endif
749.endm
750
751.text
752.type	Sm4Enc4blks,%function
753.align	4
754Sm4Enc4blks:
755AARCH64_PACIASP
756	Encrypt4blks
757AARCH64_AUTIASP
758	ret
759.size	Sm4Enc4blks,.-Sm4Enc4blks
760
761.type	Sm4Enc8blks,%function
762.align	4
763Sm4Enc8blks:
764AARCH64_PACIASP
765	Encrypt8blks
766AARCH64_AUTIASP
767	ret
768.size	Sm4Enc8blks,.-Sm4Enc8blks
769
770.type	Sm4Enc12blks,%function
771.align	4
772Sm4Enc12blks:
773AARCH64_PACIASP
774	Encrypt12blks
775AARCH64_AUTIASP
776	ret
777.size	Sm4Enc12blks,.-Sm4Enc12blks
778
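# Register usage, inferred from the code below (not a documented prototype):
# x0/inp = input, x1/outp = output, x2 = length in bytes (converted to
# 16-byte blocks), x3/rks = SM4 round keys.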
779.globl	Vpsm4EcbEncrypt
780.type	Vpsm4EcbEncrypt,%function
781.align	5
782Vpsm4EcbEncrypt:
783AARCH64_PACIASP
784	// convert length into blocks
785	lsr	x2,x2,4
786	stp	d8,d9,[sp,#-112]!
787	stp	d10,d11,[sp,#16]
788	stp	d12,d13,[sp,#32]
789	stp	d14,d15,[sp,#48]
790	stp	x29,x30,[sp,#64]
791	stp	x19,x20,[sp,#80]
792	stp	x21,x22,[sp,#96]
793	LoadSbox
794
795.Lecb_12_blocks_process:
796	cmp	blocks,#12
797	b.lt  .Lecb_8_blocks_process
798	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[inp],#64
799	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[inp],#64
800	ld4	{v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64
801
802#ifndef HITLS_BIG_ENDIAN
803	rev32	v4.16b,v4.16b
804	rev32	v5.16b,v5.16b
805	rev32	v6.16b,v6.16b
806	rev32	v7.16b,v7.16b
807
808	rev32	v8.16b,v8.16b
809	rev32	v9.16b,v9.16b
810	rev32	v10.16b,v10.16b
811	rev32	v11.16b,v11.16b
812
813	rev32	v16.16b,v16.16b
814	rev32	v17.16b,v17.16b
815	rev32	v18.16b,v18.16b
816	rev32	v19.16b,v19.16b
817#endif
818
819	bl	Sm4Enc12blks
820	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64
821	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64
822	st4	{v8.4s,v9.4s,v10.4s,v11.4s},[outp],#64
823	subs    blocks,blocks,#12
824	b.gt	.Lecb_12_blocks_process
825	b	100f
826
827.Lecb_8_blocks_process:
828	cmp	blocks, #8
829	b.lt	.Lecb_4_blocks_process
830	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
831	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
832#ifndef HITLS_BIG_ENDIAN
833	rev32	v4.16b,v4.16b
834	rev32	v5.16b,v5.16b
835	rev32	v6.16b,v6.16b
836	rev32	v7.16b,v7.16b
837	rev32	v8.16b,v8.16b
838	rev32	v9.16b,v9.16b
839	rev32	v10.16b,v10.16b
840	rev32	v11.16b,v11.16b
841#endif
842	bl	Sm4Enc8blks
843	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
844	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
845	subs	blocks,blocks,#8
846	b.gt	.Lecb_8_blocks_process
847	b	100f
848.Lecb_4_blocks_process:
849	cmp	blocks,#4
850	b.lt	1f
851	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
852#ifndef HITLS_BIG_ENDIAN
853	rev32	v4.16b, v4.16b
854	rev32	v5.16b, v5.16b
855	rev32	v6.16b, v6.16b
856	rev32	v7.16b, v7.16b
857#endif
858	bl	Sm4Enc4blks
859	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
860	sub	blocks,blocks,#4
8611:
862	// process last block
863	cmp	blocks,#1
864	b.lt	100f
865	b.gt	1f
866
867	adrp	 x19, .Ltbox1
868    add x19,x19,:lo12:.Ltbox1
869	adrp	 x20, .Ltbox2
870    add x20,x20,:lo12:.Ltbox2
871	adrp	 x21, .Ltbox3
872    add x21,x21,:lo12:.Ltbox3
873	adrp	 x22, .Ltbox4
874    add x22,x22,:lo12:.Ltbox4
875
876	ldp	w8,w9,[inp],#8
877	ldp	w10,w11,[inp],#8
878#ifndef HITLS_BIG_ENDIAN
879	rev	w8,w8
880	rev	w9,w9
881	rev	w10,w10
882	rev	w11,w11
883#endif
884	EncRound
885#ifndef HITLS_BIG_ENDIAN
886	rev	w8,w8
887	rev	w9,w9
888	rev	w10,w10
889	rev	w11,w11
890#endif
891	stp	w11,w10,[outp]
892	stp	w9,w8,[outp,#8]
893	b	100f
8941:	// process last 2 blocks
895	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[inp],#16
896	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[inp],#16
897	cmp	blocks,#2
898	b.gt	1f
899#ifndef HITLS_BIG_ENDIAN
900	rev32	v4.16b,v4.16b
901	rev32	v5.16b,v5.16b
902	rev32	v6.16b,v6.16b
903	rev32	v7.16b,v7.16b
904#endif
905	bl	Sm4Enc4blks
906	st4	{v0.s,v1.s,v2.s,v3.s}[0],[outp],#16
907	st4	{v0.s,v1.s,v2.s,v3.s}[1],[outp]
908	b	100f
9091:	//	process last 3 blocks
910	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[inp],#16
911#ifndef HITLS_BIG_ENDIAN
912	rev32	v4.16b,v4.16b
913	rev32	v5.16b,v5.16b
914	rev32	v6.16b,v6.16b
915	rev32	v7.16b,v7.16b
916#endif
917	bl	Sm4Enc4blks
918	st4	{v0.s,v1.s,v2.s,v3.s}[0],[outp],#16
919	st4	{v0.s,v1.s,v2.s,v3.s}[1],[outp],#16
920	st4	{v0.s,v1.s,v2.s,v3.s}[2],[outp]
921100:
922	ldp	d10,d11,[sp,#16]
923	ldp	d12,d13,[sp,#32]
924	ldp	d14,d15,[sp,#48]
925	ldp	x29,x30,[sp,#64]
926	ldp	x19,x20,[sp,#80]
927	ldp	x21,x22,[sp,#96]
928	ldp	d8,d9,[sp],#112
929AARCH64_AUTIASP
930	ret
931.size	Vpsm4EcbEncrypt,.-Vpsm4EcbEncrypt
932
933
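# Register usage, inferred from the code below (not a documented prototype):
# x0/inp = input, x1/outp = output, x2/len = length in bytes, x3/rks = SM4
# round keys, x4/ivp = 16-byte IV (updated on return), w5 = mode flag
# (non-zero: CBC encrypt; zero: branch to .Ldec for CBC decrypt).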
934.globl	Vpsm4CbcEncrypt
935.type	Vpsm4CbcEncrypt,%function
936.align	5
937Vpsm4CbcEncrypt:
938AARCH64_PACIASP
939	lsr	len,len,4
940	stp	x29,x30,[sp,#-48]!
941	stp	x19,x20,[sp,#16]
942	stp	x21,x22,[sp,#32]
943
944	// load tbox
945	adrp	 x19, .Ltbox1
946    add x19,x19,:lo12:.Ltbox1
947	adrp	 x20, .Ltbox2
948    add x20,x20,:lo12:.Ltbox2
949	adrp	 x21, .Ltbox3
950    add x21,x21,:lo12:.Ltbox3
951	adrp	 x22, .Ltbox4
952    add x22,x22,:lo12:.Ltbox4
953
954	cbz	w5,.Ldec
955
956	// load iv
957	ldp	w8,w9,[ivp]
958	ldp	w10,w11,[ivp,#8]
959.Lcbc_1_block_enc:
960	subs	blocks,blocks,#1
961	b.lt	2f
962	ldp	w6,w7,[inp],#8
963	ldp	w16,w17,[inp],#8
964	eor	w8,w8,w6
965	eor	w9,w9,w7
966	eor	w10,w10,w16
967	eor	w11,w11,w17
968#ifndef HITLS_BIG_ENDIAN
969	rev	w8,w8
970	rev	w9,w9
971	rev	w10,w10
972	rev	w11,w11
973#endif
974	EncRound
975#ifndef HITLS_BIG_ENDIAN
976	rev	w8,w8
977	rev	w9,w9
978	rev	w10,w10
979	rev	w11,w11
980#endif
981	// reverse to store
982	mov	w6,w8
983	mov	w8,w11
984	mov	w11,w6
985	mov	w7,w9
986	mov	w9,w10
987	mov	w10,w7
988
989	stp	w8,w9,[outp],#8
990	stp	w10,w11,[outp],#8
991	b	.Lcbc_1_block_enc
9922:
993	// save back IV
994	stp	w8,w9,[ivp]
995	stp	w10,w11,[ivp,#8]
996
997	ldp	x19,x20,[sp,#16]
998	ldp	x21,x22,[sp,#32]
999	ldp	x29,x30,[sp],#48
1000AARCH64_AUTIASP
1001	ret
1002
1003.Ldec:
1004	LoadSbox
1005	// decryption mode starts
1006	stp	d8,d9,[sp,#-64]!
1007	stp	d10,d11,[sp,#16]
1008	stp	d12,d13,[sp,#32]
1009	stp	d14,d15,[sp,#48]
1010
1011.Lcbc_12_blocks_dec:
1012	cmp	w2,#12
1013	b.lt	.Lcbc_8_blocks_dec
1014	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0]
1015	add	x10,x0,#64
1016	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x10]
1017	add	x10,x10,#64
1018	ld4	{v16.4s,v17.4s,v18.4s,v19.4s},[x10]
1019
1020#ifndef HITLS_BIG_ENDIAN
1021	rev32	v4.16b,v4.16b
1022	rev32	v5.16b,v5.16b
1023	rev32	v6.16b,v6.16b
1024	rev32	v7.16b,v7.16b
1025	rev32	v8.16b,v8.16b
1026	rev32	v9.16b,v9.16b
1027	rev32	v10.16b,v10.16b
1028	rev32	v11.16b,v11.16b
1029	rev32	v16.16b,v16.16b
1030	rev32	v17.16b,v17.16b
1031	rev32	v18.16b,v18.16b
1032	rev32	v19.16b,v19.16b
1033#endif
1034	bl	Sm4Enc12blks
1035	// transpose to xor iv
1036	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d
1037	transpose v4.4s,v5.4s,v6.4s,v7.4s,v4.2d,v5.2d,v6.2d,v7.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d
1038	transpose v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d
1039	ld1	{ivec1.4s},[ivp]
1040	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64
1041	eor	v0.16b,v0.16b,ivec1.16b
1042	ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
1043	eor	v1.16b,v1.16b,v16.16b
1044	eor	v2.16b,v2.16b,v17.16b
1045	eor	v3.16b,v3.16b,v18.16b
1046	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64
1047
1048	eor v4.16b,v4.16b,v19.16b
1049	eor v5.16b,v5.16b,v12.16b
1050	eor v6.16b,v6.16b,v13.16b
1051	eor v7.16b,v7.16b,v14.16b
1052	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64
1053
1054	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64
1055	eor v8.16b,v8.16b,v15.16b
1056	eor v9.16b,v9.16b,v16.16b
1057	eor v10.16b,v10.16b,v17.16b
1058	eor v11.16b,v11.16b,v18.16b
1059	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[outp],#64
1060	// save back iv
1061	st1	{v19.4s}, [ivp]
1062
1063	subs    blocks,blocks,#12
1064	b.gt	.Lcbc_12_blocks_dec
1065	b	100f
1066
1067.Lcbc_8_blocks_dec:
1068	cmp	blocks,#8
1069	b.lt	1f
1070	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[inp]
1071	add	ptr, inp, #64
1072	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[ptr]
1073
1074#ifndef HITLS_BIG_ENDIAN
1075	rev32	v4.16b,v4.16b
1076	rev32	v5.16b,v5.16b
1077	rev32	v6.16b,v6.16b
1078	rev32	v7.16b,v7.16b
1079	rev32	v8.16b,v8.16b
1080	rev32	v9.16b,v9.16b
1081	rev32	v10.16b,v10.16b
1082	rev32	v11.16b,v11.16b
1083#endif
1084	bl	Sm4Enc8blks
1085	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
1086	transpose v4.4s,v5.4s,v6.4s,v7.4s,v4.2d,v5.2d,v6.2d,v7.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
1087	ld1	{ivec1.4s},[ivp]
1088	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[inp],#64
1089	// note ivec1 and v15 are reusing the same register,
1090	// so care needs to be taken to avoid a conflict
1091	eor	v0.16b,v0.16b,ivec1.16b
1092	ld1	{v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
1093	eor	v1.16b,v1.16b,v8.16b
1094	eor	v2.16b,v2.16b,v9.16b
1095	eor	v3.16b,v3.16b,v10.16b
1096	// save back IV
1097	st1	{v15.4s}, [ivp]
1098	eor	v4.16b,v4.16b,v11.16b
1099	eor	v5.16b,v5.16b,v12.16b
1100	eor	v6.16b,v6.16b,v13.16b
1101	eor	v7.16b,v7.16b,v14.16b
1102	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64
1103	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64
1104	subs	blocks,blocks,#8
1105	b.gt	.Lcbc_8_blocks_dec
1106	b.eq	100f
11071:
1108	ld1	{ivec1.4s},[ivp]
1109.Lcbc_4_blocks_dec:
1110	cmp	blocks,#4
1111	b.lt	1f
1112	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[inp]
1113#ifndef HITLS_BIG_ENDIAN
1114	rev32	v4.16b,v4.16b
1115	rev32	v5.16b,v5.16b
1116	rev32	v6.16b,v6.16b
1117	rev32	v7.16b,v7.16b
1118#endif
1119	bl	Sm4Enc4blks
1120	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[inp],#64
1121	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
1122	eor	v0.16b,v0.16b,ivec1.16b
1123	eor	v1.16b,v1.16b,v4.16b
1124	orr	v15.16b,v7.16b,v7.16b
1125	eor	v2.16b,v2.16b,v5.16b
1126	eor	v3.16b,v3.16b,v6.16b
1127	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64
1128	// save back IV
1129	st1	{v7.4s}, [ivp]
1130	subs	blocks,blocks,#4
1131	b.gt	.Lcbc_4_blocks_dec
1132	b	100f
11331:	// last block
1134	subs	blocks,blocks,#1
1135	b.lt	100f
1136	b.gt	1f
1137	// load iv
1138	ldp	w6,w7,[ivp]
1139	ldp	w16,w17,[ivp,#8]
1140
1141	ldp	w8,w9,[inp]
1142	ldp	w10,w11,[inp,#8]
1143	// store back iv
1144	stp	w8,w9,[ivp]
1145	stp	w10,w11,[ivp,#8]
1146#ifndef HITLS_BIG_ENDIAN
1147	rev	w8,w8
1148	rev	w9,w9
1149	rev	w10,w10
1150	rev	w11,w11
1151#endif
1152	EncRound
1153#ifndef HITLS_BIG_ENDIAN
1154	rev	w8,w8
1155	rev	w9,w9
1156	rev	w10,w10
1157	rev	w11,w11
1158#endif
1159	eor	w11,w11,w6
1160	eor	w10,w10,w7
1161	eor	w9,w9,w16
1162	eor	w8,w8,w17
1163	stp	w11,w10,[outp],#8
1164	stp	w9,w8,[outp],#8
1165	b	100f
11661:	// last two blocks
1167	ld4	{v4.s,v5.s,v6.s,v7.s}[0], [inp]
1168	add	ptr,inp,#16
1169	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[ptr],#16
1170	subs	blocks,blocks,1
1171	b.gt	1f
1172#ifndef HITLS_BIG_ENDIAN
1173	rev32	v4.16b,v4.16b
1174	rev32	v5.16b,v5.16b
1175	rev32	v6.16b,v6.16b
1176	rev32	v7.16b,v7.16b
1177#endif
1178	bl	Sm4Enc4blks
1179	ld1	{v4.4s,v5.4s},[inp],#32
1180	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
1181	eor	v0.16b,v0.16b,ivec1.16b
1182	eor	v1.16b,v1.16b,v4.16b
1183	st1	{v0.4s,v1.4s},[outp],#32
1184	// save back IV
1185	st1	{v5.4s}, [ivp]
1186	b	100f
11871:	//	last 3 blocks
1188	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[ptr]
1189#ifndef HITLS_BIG_ENDIAN
1190	rev32	v4.16b,v4.16b
1191	rev32	v5.16b,v5.16b
1192	rev32	v6.16b,v6.16b
1193	rev32	v7.16b,v7.16b
1194#endif
1195	bl	Sm4Enc4blks
1196	ld1	{v4.4s,v5.4s,v6.4s},[inp],#48
1197	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
1198	eor	v0.16b,v0.16b,ivec1.16b
1199	eor	v1.16b,v1.16b,v4.16b
1200	eor	v2.16b,v2.16b,v5.16b
1201	st1	{v0.4s,v1.4s,v2.4s},[outp],#48
1202	// save back IV
1203	st1	{v6.4s}, [ivp]
1204100:
1205	ldp	d10,d11,[sp,#16]
1206	ldp	d12,d13,[sp,#32]
1207	ldp	d14,d15,[sp,#48]
1208	ldp	d8,d9,[sp],#64
1209	ldp	x19,x20,[sp,#16]
1210	ldp	x21,x22,[sp,#32]
1211	ldp	x29,x30,[sp],#48
1212AARCH64_AUTIASP
1213	ret
1214.size	Vpsm4CbcEncrypt,.-Vpsm4CbcEncrypt
1215
1216
1217# void Vpsm4Ctr32EncryptBlocks(const uint8_t *in, uint8_t *out, uint64_t blocks, const uint32_t *key, uint8_t *iv);
1218.globl	Vpsm4Ctr32EncryptBlocks
1219.type	Vpsm4Ctr32EncryptBlocks,%function
1220.align	5
1221Vpsm4Ctr32EncryptBlocks:
1222AARCH64_PACIASP
1223	ld1	{ivec.4s},[ivp]
1224#ifndef HITLS_BIG_ENDIAN
1225	rev32	v3.16b,v3.16b
1226#endif
1227	LoadSbox
1228	cmp	blocks,#1
1229	b.ne	1f
1230	// fast path for a single block, without the
1231	// context-saving overhead
1232	stp	x19,x20,[sp,#-32]!
1233	stp	x21,x22,[sp,#16]
1234	adrp	 x19, .Ltbox1
1235    add x19,x19,:lo12:.Ltbox1
1236	adrp	 x20, .Ltbox2
1237    add x20,x20,:lo12:.Ltbox2
1238	adrp	 x21, .Ltbox3
1239    add x21,x21,:lo12:.Ltbox3
1240	adrp	 x22, .Ltbox4
1241    add x22,x22,:lo12:.Ltbox4
1242
1243	Encrypt1blkNorevCtr
1244
1245	ld1	{v4.4s},[inp]
1246	eor	v4.16b,v4.16b,ivec.16b
1247	st1	{v4.4s},[outp]
1248	ldp	x21,x22,[sp,#16]
1249	ldp	x19,x20,[sp],#32
1250	ldr ctr,[ivp,#12]
1251#ifndef HITLS_BIG_ENDIAN
1252	rev ctr,ctr
1253#endif
1254	add ctr,ctr,#1
1255#ifndef HITLS_BIG_ENDIAN
1256	rev ctr,ctr
1257#endif
1258	str ctr,[ivp,#12]
1259AARCH64_AUTIASP
1260	ret
12611:
1262	stp	d8,d9,[sp,#-112]!
1263	stp	d10,d11,[sp,#16]
1264	stp	d12,d13,[sp,#32]
1265	stp	d14,d15,[sp,#48]
1266	stp	x29,x30,[sp,#64]
1267	stp	x19,x20,[sp,#80]
1268	stp	x21,x22,[sp,#96]
1269	mov	word0, ivec.s[0]
1270	mov	word1, ivec.s[1]
1271	mov	word2, ivec.s[2]
1272	mov	ctr, ivec.s[3]
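	// Counter blocks are kept de-interleaved to match ld4/st4: v4/v5/v6
	// (and v8/v9/v10, v16/v17/v18) broadcast IV words 0..2, while
	// v7/v11/v19 hold four consecutive 32-bit counter values, one per lane.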
1273.Lctr32_4_blocks_process:
1274	cmp	blocks,#4
1275	b.lt	1f
1276	dup	v4.4s,word0
1277	dup	v5.4s,word1
1278	dup	v6.4s,word2
1279	mov	v7.s[0],ctr
1280	add	ctr,ctr,#1
1281	mov	v7.s[1],ctr
1282	add	ctr,ctr,#1
1283	mov	v7.s[2],ctr
1284	add	ctr,ctr,#1
1285	mov	v7.s[3],ctr
1286	add	ctr,ctr,#1
1287	cmp	blocks,#8
1288	b.ge	.Lctr32_8_blocks_process
1289	bl	Sm4Enc4blks
1290	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
1291	eor	v0.16b,v0.16b,v12.16b
1292	eor	v1.16b,v1.16b,v13.16b
1293	eor	v2.16b,v2.16b,v14.16b
1294	eor	v3.16b,v3.16b,v15.16b
1295	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64
1296	subs	blocks,blocks,#4
1297	b.ne	.Lctr32_4_blocks_process
1298	b	100f
1299.Lctr32_8_blocks_process:
1300	dup	v8.4s,word0
1301	dup	v9.4s,word1
1302	dup	v10.4s,word2
1303	mov	v11.s[0],ctr
1304	add	ctr,ctr,#1
1305	mov	v11.s[1],ctr
1306	add	ctr,ctr,#1
1307	mov	v11.s[2],ctr
1308	add	ctr,ctr,#1
1309	mov	v11.s[3],ctr
1310	add	ctr,ctr,#1
1311	cmp	blocks,#12
1312	b.ge	.Lctr32_12_blocks_process
1313	bl	Sm4Enc8blks
1314	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
1315	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[inp],#64
1316	eor	v0.16b,v0.16b,v12.16b
1317	eor	v1.16b,v1.16b,v13.16b
1318	eor	v2.16b,v2.16b,v14.16b
1319	eor	v3.16b,v3.16b,v15.16b
1320	eor	v4.16b,v4.16b,v8.16b
1321	eor	v5.16b,v5.16b,v9.16b
1322	eor	v6.16b,v6.16b,v10.16b
1323	eor	v7.16b,v7.16b,v11.16b
1324	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64
1325	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64
1326	subs	blocks,blocks,#8
1327	b.ne	.Lctr32_4_blocks_process
1328	b	100f
1329.Lctr32_12_blocks_process:
1330	dup	v16.4s,word0
1331	dup	v17.4s,word1
1332	dup	v18.4s,word2
1333	mov	v19.s[0],ctr
1334	add	ctr,ctr,#1
1335	mov	v19.s[1],ctr
1336	add	ctr,ctr,#1
1337	mov	v19.s[2],ctr
1338	add	ctr,ctr,#1
1339	mov	v19.s[3],ctr
1340	add	ctr,ctr,#1
1341	bl	Sm4Enc12blks
1342	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
1343	eor	v0.16b,v0.16b,v12.16b
1344	eor	v1.16b,v1.16b,v13.16b
1345	eor	v2.16b,v2.16b,v14.16b
1346	eor	v3.16b,v3.16b,v15.16b
1347	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64
1348	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
1349	eor	v4.16b,v4.16b,v12.16b
1350	eor	v5.16b,v5.16b,v13.16b
1351	eor	v6.16b,v6.16b,v14.16b
1352	eor	v7.16b,v7.16b,v15.16b
1353	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64
1354	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
1355	eor	v8.16b,v8.16b,v12.16b
1356	eor	v9.16b,v9.16b,v13.16b
1357	eor	v10.16b,v10.16b,v14.16b
1358	eor	v11.16b,v11.16b,v15.16b
1359	st4	{v8.4s,v9.4s,v10.4s,v11.4s},[outp],#64
1360	subs	blocks,blocks,#12
1361	b.ne	.Lctr32_4_blocks_process
1362	b	100f
1363
13641:	//	last block processing
1365	subs	blocks,blocks,#1
1366	b.lt	100f
1367	b.gt	1f
1368	mov	ivec.s[0],word0
1369	mov	ivec.s[1],word1
1370	mov	ivec.s[2],word2
1371	mov	ivec.s[3],ctr
1372	add	ctr,ctr,#1
1373
1374	adrp	 x19, .Ltbox1
1375    add x19,x19,:lo12:.Ltbox1
1376	adrp	 x20, .Ltbox2
1377    add x20,x20,:lo12:.Ltbox2
1378	adrp	 x21, .Ltbox3
1379    add x21,x21,:lo12:.Ltbox3
1380	adrp	 x22, .Ltbox4
1381    add x22,x22,:lo12:.Ltbox4
1382
1383	Encrypt1blkNorevCtr
1384
1385	ld1	{v4.4s},[inp]
1386	eor	v4.16b,v4.16b,ivec.16b
1387	st1	{v4.4s},[outp]
1388	b	100f
1389
13901:	// last 2 blocks processing
1391
1392	dup	v4.4s,word0
1393	dup	v5.4s,word1
1394	dup	v6.4s,word2
1395	mov	v7.s[0],ctr
1396	add	ctr,ctr,#1
1397	mov	v7.s[1],ctr
1398	subs	blocks,blocks,#1
1399	b.ne	1f
1400	add	ctr,ctr,#1
1401	bl	Sm4Enc4blks
1402	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[inp],#16
1403	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[inp],#16
1404	eor	v0.16b,v0.16b,v12.16b
1405	eor	v1.16b,v1.16b,v13.16b
1406	eor	v2.16b,v2.16b,v14.16b
1407	eor	v3.16b,v3.16b,v15.16b
1408	st4	{v0.s,v1.s,v2.s,v3.s}[0],[outp],#16
1409	st4	{v0.s,v1.s,v2.s,v3.s}[1],[outp],#16
1410	b	100f
1411
14121:	//	last 3 blocks processing
1413	add	ctr,ctr,#1
1414	mov	v7.s[2],ctr
1415	add	ctr,ctr,#1
1416	bl	Sm4Enc4blks
1417	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[inp],#16
1418	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[inp],#16
1419	ld4	{v12.s,v13.s,v14.s,v15.s}[2],[inp],#16
1420	eor	v0.16b,v0.16b,v12.16b
1421	eor	v1.16b,v1.16b,v13.16b
1422	eor	v2.16b,v2.16b,v14.16b
1423	eor	v3.16b,v3.16b,v15.16b
1424	st4	{v0.s,v1.s,v2.s,v3.s}[0],[outp],#16
1425	st4	{v0.s,v1.s,v2.s,v3.s}[1],[outp],#16
1426	st4	{v0.s,v1.s,v2.s,v3.s}[2],[outp],#16
1427100:
1428	ldp	d10,d11,[sp,#16]
1429	ldp	d12,d13,[sp,#32]
1430	ldp	d14,d15,[sp,#48]
1431	ldp	x29,x30,[sp,#64]
1432	ldp	x19,x20,[sp,#80]
1433	ldp	x21,x22,[sp,#96]
1434	ldp	d8,d9,[sp],#112
1435#ifndef HITLS_BIG_ENDIAN
1436	rev ctr, ctr
1437#endif
1438	str ctr, [ivp,#12]
1439AARCH64_AUTIASP
1440	ret
1441.size	Vpsm4Ctr32EncryptBlocks,.-Vpsm4Ctr32EncryptBlocks
1442
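# Register usage, inferred from the code below; the roles of x3/x4 and w6 are
# assumptions based only on how they are saved here: x0 = input, x1 = output,
# x2 = length in bytes, x3/x4 = round-key schedules (saved to x26/x27),
# x5 = initial tweak (loaded into v16), w6 = mode flag (saved to w28).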
1443.globl	Vpsm4XtsCipher
1444.type	Vpsm4XtsCipher,%function
1445.align	5
1446Vpsm4XtsCipher:
1447AARCH64_PACIASP
1448	stp	x19, x20, [sp, #-0x10]!
1449	stp	x21, x22, [sp, #-0x10]!
1450	stp	x23, x24, [sp, #-0x10]!
1451	stp	x25, x26, [sp, #-0x10]!
1452	stp	x27, x28, [sp, #-0x10]!
1453	stp	x29, x30, [sp, #-0x10]!
1454	stp	d8, d9, [sp, #-0x10]!
1455	stp	d10, d11, [sp, #-0x10]!
1456	stp	d12, d13, [sp, #-0x10]!
1457	stp	d14, d15, [sp, #-0x10]!
1458	sub	sp, sp, #192
1459	mov	x24, sp
1460	mov	x26,x3
1461	mov	x27,x4
1462	mov	w28,w6
1463	ld1	{v16.4s}, [x5]
1464	LoadSbox
1465
1466	and	x29,x2,#0x0F
1467	// convert length into blocks
1468	lsr	x2,x2,4
1469	cmp	x2,#1
1470	b.lt	.Lxts_cipher_return
1471
1472	cmp	x29,0
1473	// If the encryption/decryption length is a multiple of 16,
1474	// all blocks are encrypted/decrypted in .xts_encrypt_blocks
1475	b.eq	.xts_encrypt_blocks
1476
1477	// If the encryption/decryption length is not a multiple of 16,
1478	// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak,
1479	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks
1480	subs	x2,x2,#1
1481	b.eq	.only_2blks_tweak
1482.xts_encrypt_blocks:
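	// Each tweak is the previous one multiplied by x in GF(2^128)
	// (reduction polynomial x^128 + x^7 + x^2 + x + 1, hence the 0x87
	// constant).  A rough C sketch (hypothetical helper, not part of this
	// file) of the doubling that the recurring mov/extr/and/eor pattern
	// performs on a (lo, hi) 64-bit pair:
	/*
	 *  void xts_double(uint64_t t[2])   // t[0] = lo, t[1] = hi
	 *  {
	 *      uint64_t carry = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
	 *      t[1] = (t[1] << 1) | (t[0] >> 63);
	 *      t[0] = (t[0] << 1) ^ carry;
	 *  }
	 */
	// The rbit before and after appears to adjust the bit order so that this
	// shift-left form can be used directly in general-purpose registers.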
1483	rbit	v16.16b,v16.16b
1484#ifdef HITLS_BIG_ENDIAN
1485	rev32	v16.16b,v16.16b
1486#endif
1487	mov	x12,v16.d[0]
1488	mov	x13,v16.d[1]
1489	mov	w7,0x87
1490	extr	x9,x13,x13,#32
1491	extr	x15,x13,x12,#63
1492	and	w8,w7,w9,asr#31
1493	eor	x14,x8,x12,lsl#1
1494	mov	w7,0x87
1495	extr	x9,x15,x15,#32
1496	extr	x17,x15,x14,#63
1497	and	w8,w7,w9,asr#31
1498	eor	x16,x8,x14,lsl#1
1499	mov	w7,0x87
1500	extr	x9,x17,x17,#32
1501	extr	x19,x17,x16,#63
1502	and	w8,w7,w9,asr#31
1503	eor	x18,x8,x16,lsl#1
1504.Lxts_12_blocks_process:
1505	mov	x24, sp
1506	cmp	x2,#12
1507	b.lt	.Lxts_8_blocks_process
1508	mov	v16.d[0],x12
1509	mov	v16.d[1],x13
1510#ifdef HITLS_BIG_ENDIAN
1511	rev32	v16.16b,v16.16b
1512#endif
1513	mov	w7,0x87
1514	extr	x9,x19,x19,#32
1515	extr	x13,x19,x18,#63
1516	and	w8,w7,w9,asr#31
1517	eor	x12,x8,x18,lsl#1
1518	mov	v17.d[0],x14
1519	mov	v17.d[1],x15
1520#ifdef HITLS_BIG_ENDIAN
1521	rev32	v17.16b,v17.16b
1522#endif
1523	mov	w7,0x87
1524	extr	x9,x13,x13,#32
1525	extr	x15,x13,x12,#63
1526	and	w8,w7,w9,asr#31
1527	eor	x14,x8,x12,lsl#1
1528	mov	v18.d[0],x16
1529	mov	v18.d[1],x17
1530#ifdef HITLS_BIG_ENDIAN
1531	rev32	v18.16b,v18.16b
1532#endif
1533	mov	w7,0x87
1534	extr	x9,x15,x15,#32
1535	extr	x17,x15,x14,#63
1536	and	w8,w7,w9,asr#31
1537	eor	x16,x8,x14,lsl#1
1538	mov	v19.d[0],x18
1539	mov	v19.d[1],x19
1540#ifdef HITLS_BIG_ENDIAN
1541	rev32	v19.16b,v19.16b
1542#endif
1543	mov	w7,0x87
1544	extr	x9,x17,x17,#32
1545	extr	x19,x17,x16,#63
1546	and	w8,w7,w9,asr#31
1547	eor	x18,x8,x16,lsl#1
1548	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
1549	rbit	v16.16b,v16.16b
1550	rbit	v17.16b,v17.16b
1551	rbit	v18.16b,v18.16b
1552	rbit	v19.16b,v19.16b
1553	eor	v4.16b, v4.16b, v16.16b
1554	eor	v5.16b, v5.16b, v17.16b
1555	eor	v6.16b, v6.16b, v18.16b
1556	eor	v7.16b, v7.16b, v19.16b
1557	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
1558	st1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
1559	mov	v16.d[0],x12
1560	mov	v16.d[1],x13
1561#ifdef HITLS_BIG_ENDIAN
1562	rev32	v16.16b,v16.16b
1563#endif
1564	mov	w7,0x87
1565	extr	x9,x19,x19,#32
1566	extr	x13,x19,x18,#63
1567	and	w8,w7,w9,asr#31
1568	eor	x12,x8,x18,lsl#1
1569	mov	v17.d[0],x14
1570	mov	v17.d[1],x15
1571#ifdef HITLS_BIG_ENDIAN
1572	rev32	v17.16b,v17.16b
1573#endif
1574	mov	w7,0x87
1575	extr	x9,x13,x13,#32
1576	extr	x15,x13,x12,#63
1577	and	w8,w7,w9,asr#31
1578	eor	x14,x8,x12,lsl#1
1579	mov	v18.d[0],x16
1580	mov	v18.d[1],x17
1581#ifdef HITLS_BIG_ENDIAN
1582	rev32	v18.16b,v18.16b
1583#endif
1584	mov	w7,0x87
1585	extr	x9,x15,x15,#32
1586	extr	x17,x15,x14,#63
1587	and	w8,w7,w9,asr#31
1588	eor	x16,x8,x14,lsl#1
1589	mov	v19.d[0],x18
1590	mov	v19.d[1],x19
1591#ifdef HITLS_BIG_ENDIAN
1592	rev32	v19.16b,v19.16b
1593#endif
1594	mov	w7,0x87
1595	extr	x9,x17,x17,#32
1596	extr	x19,x17,x16,#63
1597	and	w8,w7,w9,asr#31
1598	eor	x18,x8,x16,lsl#1
1599	rbit	v16.16b,v16.16b
1600	rbit	v17.16b,v17.16b
1601	rbit	v18.16b,v18.16b
1602	rbit	v19.16b,v19.16b
1603	eor	v8.16b, v8.16b, v16.16b
1604	eor	v9.16b, v9.16b, v17.16b
1605	eor	v10.16b, v10.16b, v18.16b
1606	eor	v11.16b, v11.16b, v19.16b
1607	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64
1608	st1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
1609	mov	v16.d[0],x12
1610	mov	v16.d[1],x13
1611#ifdef HITLS_BIG_ENDIAN
1612	rev32	v16.16b,v16.16b
1613#endif
1614	mov	w7,0x87
1615	extr	x9,x19,x19,#32
1616	extr	x13,x19,x18,#63
1617	and	w8,w7,w9,asr#31
1618	eor	x12,x8,x18,lsl#1
1619	mov	v17.d[0],x14
1620	mov	v17.d[1],x15
1621#ifdef HITLS_BIG_ENDIAN
1622	rev32	v17.16b,v17.16b
1623#endif
1624	mov	w7,0x87
1625	extr	x9,x13,x13,#32
1626	extr	x15,x13,x12,#63
1627	and	w8,w7,w9,asr#31
1628	eor	x14,x8,x12,lsl#1
1629	mov	v18.d[0],x16
1630	mov	v18.d[1],x17
1631#ifdef HITLS_BIG_ENDIAN
1632	rev32	v18.16b,v18.16b
1633#endif
1634	mov	w7,0x87
1635	extr	x9,x15,x15,#32
1636	extr	x17,x15,x14,#63
1637	and	w8,w7,w9,asr#31
1638	eor	x16,x8,x14,lsl#1
1639	mov	v19.d[0],x18
1640	mov	v19.d[1],x19
1641#ifdef HITLS_BIG_ENDIAN
1642	rev32	v19.16b,v19.16b
1643#endif
1644	mov	w7,0x87
1645	extr	x9,x17,x17,#32
1646	extr	x19,x17,x16,#63
1647	and	w8,w7,w9,asr#31
1648	eor	x18,x8,x16,lsl#1
1649	rbit	v16.16b,v16.16b
1650	rbit	v17.16b,v17.16b
1651	rbit	v18.16b,v18.16b
1652	rbit	v19.16b,v19.16b
1653	eor	v0.16b, v0.16b, v16.16b
1654	eor	v1.16b, v1.16b, v17.16b
1655	eor	v2.16b, v2.16b, v18.16b
1656	eor	v3.16b, v3.16b, v19.16b
1657	st1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
1658	mov	v16.16b,v0.16b
1659	mov	v17.16b,v1.16b
1660	mov	v18.16b,v2.16b
1661	mov	v19.16b,v3.16b
1662#ifndef HITLS_BIG_ENDIAN
1663	rev32	v4.16b,v4.16b
1664	rev32	v5.16b,v5.16b
1665	rev32	v6.16b,v6.16b
1666	rev32	v7.16b,v7.16b
1667	rev32	v8.16b,v8.16b
1668	rev32	v9.16b,v9.16b
1669	rev32	v10.16b,v10.16b
1670	rev32	v11.16b,v11.16b
1671	rev32	v16.16b,v16.16b
1672	rev32	v17.16b,v17.16b
1673	rev32	v18.16b,v18.16b
1674	rev32	v19.16b,v19.16b
1675#endif
1676	zip1	v0.4s,v4.4s,v5.4s
1677	zip2	v1.4s,v4.4s,v5.4s
1678	zip1	v2.4s,v6.4s,v7.4s
1679	zip2	v3.4s,v6.4s,v7.4s
1680	zip1	v4.2d,v0.2d,v2.2d
1681	zip2	v5.2d,v0.2d,v2.2d
1682	zip1	v6.2d,v1.2d,v3.2d
1683	zip2	v7.2d,v1.2d,v3.2d
1684	zip1	v0.4s,v8.4s,v9.4s
1685	zip2	v1.4s,v8.4s,v9.4s
1686	zip1	v2.4s,v10.4s,v11.4s
1687	zip2	v3.4s,v10.4s,v11.4s
1688	zip1	v8.2d,v0.2d,v2.2d
1689	zip2	v9.2d,v0.2d,v2.2d
1690	zip1	v10.2d,v1.2d,v3.2d
1691	zip2	v11.2d,v1.2d,v3.2d
1692	zip1	v0.4s,v16.4s,v17.4s
1693	zip2	v1.4s,v16.4s,v17.4s
1694	zip1	v2.4s,v18.4s,v19.4s
1695	zip2	v3.4s,v18.4s,v19.4s
1696	zip1	v16.2d,v0.2d,v2.2d
1697	zip2	v17.2d,v0.2d,v2.2d
1698	zip1	v18.2d,v1.2d,v3.2d
1699	zip2	v19.2d,v1.2d,v3.2d
1700	bl	Sm4Enc12blks
1701	zip1	v16.4s,v0.4s,v1.4s
1702	zip2	v17.4s,v0.4s,v1.4s
1703	zip1	v18.4s,v2.4s,v3.4s
1704	zip2	v19.4s,v2.4s,v3.4s
1705	zip1	v0.2d,v16.2d,v18.2d
1706	zip2	v1.2d,v16.2d,v18.2d
1707	zip1	v2.2d,v17.2d,v19.2d
1708	zip2	v3.2d,v17.2d,v19.2d
1709	zip1	v16.4s,v4.4s,v5.4s
1710	zip2	v17.4s,v4.4s,v5.4s
1711	zip1	v18.4s,v6.4s,v7.4s
1712	zip2	v19.4s,v6.4s,v7.4s
1713	zip1	v4.2d,v16.2d,v18.2d
1714	zip2	v5.2d,v16.2d,v18.2d
1715	zip1	v6.2d,v17.2d,v19.2d
1716	zip2	v7.2d,v17.2d,v19.2d
1717	zip1	v16.4s,v8.4s,v9.4s
1718	zip2	v17.4s,v8.4s,v9.4s
1719	zip1	v18.4s,v10.4s,v11.4s
1720	zip2	v19.4s,v10.4s,v11.4s
1721	zip1	v8.2d,v16.2d,v18.2d
1722	zip2	v9.2d,v16.2d,v18.2d
1723	zip1	v10.2d,v17.2d,v19.2d
1724	zip2	v11.2d,v17.2d,v19.2d
1725	mov	x24, sp
1726	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
1727	eor	v0.16b, v0.16b, v16.16b
1728	eor	v1.16b, v1.16b, v17.16b
1729	eor	v2.16b, v2.16b, v18.16b
1730	eor	v3.16b, v3.16b, v19.16b
1731
1732	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
1733	eor	v4.16b, v4.16b, v16.16b
1734	eor	v5.16b, v5.16b, v17.16b
1735	eor	v6.16b, v6.16b, v18.16b
1736	eor	v7.16b, v7.16b, v19.16b
1737
1738	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
1739	eor	v8.16b, v8.16b, v16.16b
1740	eor	v9.16b, v9.16b, v17.16b
1741	eor	v10.16b, v10.16b, v18.16b
1742	eor	v11.16b, v11.16b, v19.16b
1743
1744	// save the last tweak
1745	mov	v24.16b,v19.16b
1746	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
1747	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
1748	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[x1],#64
1749	subs	x2,x2,#12
1750	b.gt	.Lxts_12_blocks_process
1751	b	100f
.Lxts_8_blocks_process:
	mov	x24, sp
	cmp	x2,#8
	mov	v16.d[0],x12
	mov	v16.d[1],x13
#ifdef HITLS_BIG_ENDIAN
	rev32	v16.16b,v16.16b
#endif
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x13,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x12,x8,x18,lsl#1
	mov	v17.d[0],x14
	mov	v17.d[1],x15
#ifdef HITLS_BIG_ENDIAN
	rev32	v17.16b,v17.16b
#endif
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	v18.d[0],x16
	mov	v18.d[1],x17
#ifdef HITLS_BIG_ENDIAN
	rev32	v18.16b,v18.16b
#endif
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	v19.d[0],x18
	mov	v19.d[1],x19
#ifdef HITLS_BIG_ENDIAN
	rev32	v19.16b,v19.16b
#endif
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	b.lt	.Lxts_4_blocks_process
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	rbit	v19.16b,v19.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
	eor	v7.16b, v7.16b, v19.16b
	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	st1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24], #64
	mov	v16.d[0],x12
	mov	v16.d[1],x13
#ifdef HITLS_BIG_ENDIAN
	rev32	v16.16b,v16.16b
#endif
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x13,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x12,x8,x18,lsl#1
	mov	v17.d[0],x14
	mov	v17.d[1],x15
#ifdef HITLS_BIG_ENDIAN
	rev32	v17.16b,v17.16b
#endif
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	v18.d[0],x16
	mov	v18.d[1],x17
#ifdef HITLS_BIG_ENDIAN
	rev32	v18.16b,v18.16b
#endif
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	v19.d[0],x18
	mov	v19.d[1],x19
#ifdef HITLS_BIG_ENDIAN
	rev32	v19.16b,v19.16b
#endif
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	rbit	v19.16b,v19.16b
	eor	v8.16b, v8.16b, v16.16b
	eor	v9.16b, v9.16b, v17.16b
	eor	v10.16b, v10.16b, v18.16b
	eor	v11.16b, v11.16b, v19.16b
	st1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
	rev32	v8.16b,v8.16b
	rev32	v9.16b,v9.16b
	rev32	v10.16b,v10.16b
	rev32	v11.16b,v11.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	zip1	v0.4s,v8.4s,v9.4s
	zip2	v1.4s,v8.4s,v9.4s
	zip1	v2.4s,v10.4s,v11.4s
	zip2	v3.4s,v10.4s,v11.4s
	zip1	v8.2d,v0.2d,v2.2d
	zip2	v9.2d,v0.2d,v2.2d
	zip1	v10.2d,v1.2d,v3.2d
	zip2	v11.2d,v1.2d,v3.2d
	bl	Sm4Enc8blks
	zip1	v8.4s,v0.4s,v1.4s
	zip2	v9.4s,v0.4s,v1.4s
	zip1	v10.4s,v2.4s,v3.4s
	zip2	v11.4s,v2.4s,v3.4s
	zip1	v0.2d,v8.2d,v10.2d
	zip2	v1.2d,v8.2d,v10.2d
	zip1	v2.2d,v9.2d,v11.2d
	zip2	v3.2d,v9.2d,v11.2d
	zip1	v8.4s,v4.4s,v5.4s
	zip2	v9.4s,v4.4s,v5.4s
	zip1	v10.4s,v6.4s,v7.4s
	zip2	v11.4s,v6.4s,v7.4s
	zip1	v4.2d,v8.2d,v10.2d
	zip2	v5.2d,v8.2d,v10.2d
	zip1	v6.2d,v9.2d,v11.2d
	zip2	v7.2d,v9.2d,v11.2d
	mov	x24, sp
	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b

	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
	eor	v7.16b, v7.16b, v19.16b

	// save the last tweak
	mov	v24.16b,v19.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs	x2,x2,#8
	b.gt	.Lxts_8_blocks_process
	b	100f
.Lxts_4_blocks_process:
	cmp	x2,#4
	b.lt	1f
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	rbit	v19.16b,v19.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
	eor	v7.16b, v7.16b, v19.16b
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	Sm4Enc4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub	x2,x2,#4
	// save the last tweak
	mov	v24.16b,v19.16b
	mov	v16.d[0],x12
	mov	v16.d[1],x13
#ifdef HITLS_BIG_ENDIAN
	rev32	v16.16b,v16.16b
#endif
	mov	w7,0x87
	extr	x9,x19,x19,#32
	extr	x13,x19,x18,#63
	and	w8,w7,w9,asr#31
	eor	x12,x8,x18,lsl#1
	mov	v17.d[0],x14
	mov	v17.d[1],x15
#ifdef HITLS_BIG_ENDIAN
	rev32	v17.16b,v17.16b
#endif
	mov	w7,0x87
	extr	x9,x13,x13,#32
	extr	x15,x13,x12,#63
	and	w8,w7,w9,asr#31
	eor	x14,x8,x12,lsl#1
	mov	v18.d[0],x16
	mov	v18.d[1],x17
#ifdef HITLS_BIG_ENDIAN
	rev32	v18.16b,v18.16b
#endif
	mov	w7,0x87
	extr	x9,x15,x15,#32
	extr	x17,x15,x14,#63
	and	w8,w7,w9,asr#31
	eor	x16,x8,x14,lsl#1
	mov	v19.d[0],x18
	mov	v19.d[1],x19
#ifdef HITLS_BIG_ENDIAN
	rev32	v19.16b,v19.16b
#endif
	mov	w7,0x87
	extr	x9,x17,x17,#32
	extr	x19,x17,x16,#63
	and	w8,w7,w9,asr#31
	eor	x18,x8,x16,lsl#1
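	// Annotation: on entry to the label below, v16..v19 hold fresh tweaks for
	// the remaining (at most three) blocks; v16 is consumed first and the last
	// tweak actually used is kept in v24 for the stealing path.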
1:
	// process last block
	cmp	x2,#1
	b.lt	100f
	b.gt	1f
	ld1	{v4.4s},[x0],#16
	rbit	v16.16b,v16.16b
	eor	v4.16b, v4.16b, v16.16b
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
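// Annotation: the loop below runs 8 iterations of 4 SM4 rounds each (32 rounds
// total), fetching two round keys per ldp. The S-box is evaluated with the
// AESE-based trick used throughout this file: a tbl pre-shuffle with the mask
// in v26 arranges the bytes for the ShiftRows step that aese folds in, the
// nibble-wise tbl lookups (v27/v28 and v29/v30) are the affine transforms into
// and out of the AES field, and aese with an all-zero round key supplies the
// AES S-box. The eor/ror chain after each S-box is SM4's linear transform
// L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).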
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v16.16b
	st1	{v4.4s},[x1],#16
	// save the last tweak
	mov	v24.16b,v16.16b
	b	100f
1:	// process last 2 blocks
	cmp	x2,#2
	b.gt	1f
	ld1	{v4.4s,v5.4s},[x0],#32
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	Sm4Enc4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	st1	{v0.4s,v1.4s},[x1],#32
	// save the last tweak
	mov	v24.16b,v17.16b
	b	100f
1:	// process last 3 blocks
	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	rbit	v16.16b,v16.16b
	rbit	v17.16b,v17.16b
	rbit	v18.16b,v18.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v6.16b, v6.16b, v18.16b
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
#endif
	zip1	v0.4s,v4.4s,v5.4s
	zip2	v1.4s,v4.4s,v5.4s
	zip1	v2.4s,v6.4s,v7.4s
	zip2	v3.4s,v6.4s,v7.4s
	zip1	v4.2d,v0.2d,v2.2d
	zip2	v5.2d,v0.2d,v2.2d
	zip1	v6.2d,v1.2d,v3.2d
	zip2	v7.2d,v1.2d,v3.2d
	bl	Sm4Enc4blks
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v2.16b, v2.16b, v18.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save the last tweak
	mov	v24.16b,v18.16b
100:
	cmp	x29,0
	b.eq	.Lxts_cipher_return

// This branch calculates the last two tweaks, used when the
// encryption/decryption length is larger than 32 bytes
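// Annotation: the NEON sequence below multiplies a tweak by x in GF(2^128).
// rbit puts the tweak into the bit order in which the multiply can be done
// byte-wise: shl #1 shifts every byte, ext/ushr picks up the bit that falls
// out of each byte, and the multiply by the constant at .Lxts_magic (defined
// elsewhere in this file) both propagates those carries and folds the top-bit
// carry back in through the XTS reduction polynomial; a second rbit restores
// the normal bit order. The net effect is the same textbook tweak update done
// with 64-bit registers in the bulk loops above.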
.last_2blks_tweak:
#ifdef HITLS_BIG_ENDIAN
	rev32	v24.16b,v24.16b
#endif
	rbit	v2.16b,v24.16b
	adrp	x26, .Lxts_magic
	add	x26, x26, :lo12:.Lxts_magic
	ldr	q0, [x26]
	shl	v17.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v17.16b, v17.16b, v1.16b
	rbit	v17.16b,v17.16b
	rbit	v2.16b,v17.16b
	adrp	x26, .Lxts_magic
	add	x26, x26, :lo12:.Lxts_magic
	ldr	q0, [x26]
	shl	v18.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v18.16b, v18.16b, v1.16b
	rbit	v18.16b,v18.16b
	b	.Lxts_check_dec


// This branch calculates the last two tweaks, used when the encryption/decryption
// length is exactly 32 bytes, which needs only two tweaks
.only_2blks_tweak:
	mov	v17.16b,v16.16b
#ifdef HITLS_BIG_ENDIAN
	rev32	v17.16b,v17.16b
#endif
	rbit	v2.16b,v17.16b
	adrp	x26, .Lxts_magic
	add	x26, x26, :lo12:.Lxts_magic
	ldr	q0, [x26]
	shl	v18.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v18.16b, v18.16b, v1.16b
	rbit	v18.16b,v18.16b
	b	.Lxts_check_dec


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.Lxts_check_dec:
	// encryption:1 decryption:0
	cmp	w28,1
	b.eq	.Lxts_prcess_last_2blks
	mov	v0.16B,v17.16b
	mov	v17.16B,v18.16b
	mov	v18.16B,v0.16b

.Lxts_prcess_last_2blks:
#ifdef HITLS_BIG_ENDIAN
	rev32	v17.16b,v17.16b
	rev32	v18.16b,v18.16b
#endif
	ld1	{v4.4s},[x0],#16
	eor	v4.16b, v4.16b, v17.16b
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v17.16b
	st1	{v4.4s},[x1],#16

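	// Annotation: XTS ciphertext stealing. x26 points at the block just
	// written above and x29 holds the number of trailing input bytes. The
	// byte loop moves the tail of that block out to the final partial output
	// block while splicing the leftover input bytes into the block at x26;
	// the patched block is then run through SM4 with the final tweak (v18)
	// and stored back in place.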
	sub	x26,x1,16
.Lxts_loop:
	subs	x29,x29,1
	ldrb	w7,[x26,x29]
	ldrb	w8,[x0,x29]
	strb	w8,[x26,x29]
	strb	w7,[x1,x29]
	b.gt	.Lxts_loop
	ld1	{v4.4s}, [x26]
	eor	v4.16b, v4.16b, v18.16b
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v31.16b, #0x0f
	mov	v3.s[0],w6
	// optimize sbox using AESE instruction
	tbl	v0.16b, {v3.16b}, v26.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v28.16b}, v0.16b
	tbl	v2.16b, {v27.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b
	eor	v1.16b, v1.16b, v1.16b
	aese	v0.16b,v1.16b
	ushr	v2.16b, v0.16b, 4
	and	v0.16b, v0.16b, v31.16b
	tbl	v0.16b, {v30.16b}, v0.16b
	tbl	v2.16b, {v29.16b}, v2.16b
	eor	v0.16b, v0.16b, v2.16b

	mov	w7,v0.s[0]
	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v18.16b
	st1	{v4.4s}, [x26]
.Lxts_cipher_return:
	add	sp, sp, #192
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
AARCH64_AUTIASP
	ret
.size	Vpsm4XtsCipher,.-Vpsm4XtsCipher

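# Presumed prototype, mirroring Vpsm4Cfb128Decrypt below (not stated in the original source):
# void Vpsm4Cfb128Encrypt(const uint8_t *in, uint8_t *out, uint64_t len, const uint32_t *key, uint8_t *iv, int *num);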
.globl	Vpsm4Cfb128Encrypt
.type	Vpsm4Cfb128Encrypt,%function
.align	5
Vpsm4Cfb128Encrypt:
AARCH64_PACIASP
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x8,[sp,#48]
	stp	x16,x17,[sp,#64]

	// load tbox
	adrp	x19, .Ltbox1
	add	x19,x19,:lo12:.Ltbox1
	adrp	x20, .Ltbox2
	add	x20,x20,:lo12:.Ltbox2
	adrp	x21, .Ltbox3
	add	x21,x21,:lo12:.Ltbox3
	adrp	x22, .Ltbox4
	add	x22,x22,:lo12:.Ltbox4

	// load num
	ldr	w23,[x5]
	cbz	w23,.Lcfb128_enc_update
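	// Annotation: a non-zero *num means the previous call stopped mid-block;
	// the iv buffer still holds the encrypted IV (keystream) with num bytes
	// already consumed. The init loop below XORs further input bytes against
	// the remaining keystream bytes and writes the ciphertext back into the
	// iv buffer, since in CFB mode the ciphertext becomes the next feedback.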
.Lcfb128_enc_init:
	ldrb	w7,[ivp,x23]
	ldrb	w8,[inp]
	eor	w7,w7,w8
	strb	w7,[outp]
	strb	w7,[ivp,x23]

	add	inp,inp,#1
	add	outp,outp,#1
	add	w23,w23,#1
	sub	len,len,#1
	cmp	w23,#16
	b.eq	.Lcfb128_enc_init_final
	cbz	len,.Lcfb128_enc_ret
	b	.Lcfb128_enc_init
.Lcfb128_enc_init_final:
	mov	w23,#0
.Lcfb128_enc_update:
	cbz	len,.Lcfb128_enc_ret
	// load iv
	ldp	w8,w9,[ivp]
	ldp	w10,w11,[ivp,#8]
#ifndef HITLS_BIG_ENDIAN
	rev	w8,w8
	rev	w9,w9
	rev	w10,w10
	rev	w11,w11
#endif
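	// Annotation: EncRound (a macro defined earlier in this file) encrypts
	// the IV block held in w8..w11 in place, producing the CFB keystream for
	// the next 16 bytes.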
	EncRound
#ifndef HITLS_BIG_ENDIAN
	rev	w8,w8
	rev	w9,w9
	rev	w10,w10
	rev	w11,w11
#endif
	// save back IV
	stp	w11,w10,[ivp]
	stp	w9,w8,[ivp,#8]

	cmp	len,#16
	b.lt	.Lcfb128_enc_final
	// xor with plain
	ldp	w6,w7,[inp],#8
	ldp	w16,w17,[inp],#8
	eor	w11,w11,w6
	eor	w10,w10,w7
	eor	w9,w9,w16
	eor	w8,w8,w17

	stp	w11,w10,[outp],#8
	stp	w9,w8,[outp],#8
	// save back IV
	stp	w11,w10,[ivp]
	stp	w9,w8,[ivp,#8]

	sub	len,len,#16
	b	.Lcfb128_enc_update
.Lcfb128_enc_final:
	ldrb	w7,[ivp,x23]
	ldrb	w8,[inp]
	eor	w7,w7,w8
	strb	w7,[outp]
	strb	w7,[ivp,x23]

	add	inp,inp,#1
	add	outp,outp,#1
	add	w23,w23,#1
	subs	len,len,#1
	b.ne	.Lcfb128_enc_final
.Lcfb128_enc_ret:
	// store num
	str	w23,[x5]

	// restore registers
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x8,[sp,#48]
	ldp	x16,x17,[sp,#64]
	ldp	x29,x30,[sp],#80
AARCH64_AUTIASP
	ret
.size	Vpsm4Cfb128Encrypt,.-Vpsm4Cfb128Encrypt

# void Vpsm4Cfb128Decrypt(const uint8_t *in, uint8_t *out, uint64_t len, const uint32_t *key, uint8_t *iv, int *num);
.globl	Vpsm4Cfb128Decrypt
.type	Vpsm4Cfb128Decrypt,%function
.align	5
Vpsm4Cfb128Decrypt:
AARCH64_PACIASP
	stp	x29,x30,[sp,#-128]!
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	d8,d9,[sp,#64]
	stp	d10,d11,[sp,#80]
	stp	d12,d13,[sp,#96]
	stp	d14,d15,[sp,#112]

	// load tbox
	adrp	x19, .Ltbox1
	add	x19,x19,:lo12:.Ltbox1
	adrp	x20, .Ltbox2
	add	x20,x20,:lo12:.Ltbox2
	adrp	x21, .Ltbox3
	add	x21,x21,:lo12:.Ltbox3
	adrp	x22, .Ltbox4
	add	x22,x22,:lo12:.Ltbox4
	LoadSbox
	// load num
	ldr	w23,[x5]
	cbz	w23,.Lcfb128_12_blocks_dec

.Lcfb128_dec_init:
	ldrb	w7,[ivp,x23]
	ldrb	w8,[inp]
	eor	w7,w7,w8
	strb	w7,[outp]
	// store the ciphertext byte into the iv buffer (CFB feedback)
	strb	w8,[ivp,x23]

	add	inp,inp,#1
	add	outp,outp,#1
	subs	len,len,#1
	add	w23,w23,#1
	and	w23,w23,#15
	b.eq	100f
	cbz	w23,.Lcfb128_12_blocks_dec
	b	.Lcfb128_dec_init

.Lcfb128_12_blocks_dec:
	cmp	len,#192
	b.lt	.Lcfb128_8_blocks_dec

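// Annotation: CFB decryption is parallelizable because every keystream block
// is the encryption of an already-known ciphertext block (or the IV):
//   P[i] = C[i] ^ E(K, C[i-1]),  with C[-1] = IV.
// The ld4 below loads four ciphertext blocks lane-wise and then overwrites the
// last lane with the IV, so the twelve blocks fed to Sm4Enc12blks are
// {C0,C1,C2,IV,C3..C10}. After the transpose, E(IV) lands in v3, which is why
// v3 (XORed with C0, i.e. P0) is stored to the output first.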
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[inp]
	// append iv as last element
	ld4	{v4.s,v5.s,v6.s,v7.s}[3],[ivp]
	add	ptr,inp,#48
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[ptr]
	add	ptr,ptr,#64
	ld4	{v16.4s,v17.4s,v18.4s,v19.4s},[ptr]
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b

	rev32	v8.16b,v8.16b
	rev32	v9.16b,v9.16b
	rev32	v10.16b,v10.16b
	rev32	v11.16b,v11.16b

	rev32	v16.16b,v16.16b
	rev32	v17.16b,v17.16b
	rev32	v18.16b,v18.16b
	rev32	v19.16b,v19.16b
#endif
	bl	Sm4Enc12blks

	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d
	transpose v4.4s,v5.4s,v6.4s,v7.4s,v4.2d,v5.2d,v6.2d,v7.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d
	transpose v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d

	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64
	ld1	{v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
	eor	v0.16b,v0.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	eor	v2.16b,v2.16b,v19.16b
	eor	v3.16b,v3.16b,v16.16b
	// the block derived from the iv decrypts to the first plaintext block; store it first
	st1	{v3.4s},[outp],#16
	st1	{v0.4s,v1.4s,v2.4s},[outp],#48

	eor	v4.16b,v4.16b,v12.16b
	eor	v5.16b,v5.16b,v13.16b
	eor	v6.16b,v6.16b,v14.16b
	eor	v7.16b,v7.16b,v15.16b
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64

	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64
	eor	v8.16b,v8.16b,v16.16b
	eor	v9.16b,v9.16b,v17.16b
	eor	v10.16b,v10.16b,v18.16b
	eor	v11.16b,v11.16b,v19.16b
	st1	{v8.4s,v9.4s,v10.4s,v11.4s},[outp],#64
	// save back IV
	st1	{v19.4s}, [ivp]

	subs	len,len,#192
	b.gt	.Lcfb128_12_blocks_dec
	b.eq	100f

.Lcfb128_8_blocks_dec:
	cmp	len,#128
	b.lt	.Lcfb128_4_blocks_dec

	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[inp]
	// append iv as last element
	ld4	{v4.s,v5.s,v6.s,v7.s}[3],[ivp]
	add	ptr,inp,#48
	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[ptr]
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
	rev32	v8.16b,v8.16b
	rev32	v9.16b,v9.16b
	rev32	v10.16b,v10.16b
	rev32	v11.16b,v11.16b
#endif
	bl	Sm4Enc8blks
	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
	transpose v4.4s,v5.4s,v6.4s,v7.4s,v4.2d,v5.2d,v6.2d,v7.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d

	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[inp],#64
	ld1	{v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
	eor	v0.16b,v0.16b,v9.16b
	eor	v1.16b,v1.16b,v10.16b
	eor	v2.16b,v2.16b,v11.16b
	eor	v3.16b,v3.16b,v8.16b
	// save back IV
	st1	{v15.4s}, [ivp]
	eor	v4.16b,v4.16b,v12.16b
	eor	v5.16b,v5.16b,v13.16b
	eor	v6.16b,v6.16b,v14.16b
	eor	v7.16b,v7.16b,v15.16b
	st1	{v3.4s},[outp],#16
	st1	{v0.4s,v1.4s,v2.4s},[outp],#48
	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64
	subs	len,len,#128
	b.gt	.Lcfb128_8_blocks_dec
	b.eq	100f
.Lcfb128_4_blocks_dec:
	cmp	len,#64
	b.lt	.Llast_block
	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[inp]
	// append iv as last element
	ld4	{v4.s,v5.s,v6.s,v7.s}[3],[ivp]
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
#endif
	bl	Sm4Enc4blks
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[inp],#64
	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
	eor	v0.16b,v0.16b,v5.16b
	eor	v1.16b,v1.16b,v6.16b
	eor	v2.16b,v2.16b,v7.16b
	eor	v3.16b,v3.16b,v4.16b
	st1	{v3.4s},[outp],#16
	st1	{v0.4s,v1.4s,v2.4s},[outp],#48
	// save back IV
	st1	{v7.4s}, [ivp]
	subs	len,len,#64
	b.gt	.Lcfb128_4_blocks_dec
	b.eq	100f

.Llast_block:	// last block
	cmp	len,#16
	b.gt	.Llast_2_blocks
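	// Annotation: scalar tail path. Each pass below encrypts the current iv
	// with EncRound to produce one keystream block, updates the feedback in
	// [ivp], and xors with the remaining input; anything shorter than a full
	// block is finished byte by byte at .Lcfb128_dec_final.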
1:
	// load in
	ldp	w6,w7,[inp]
	ldp	w16,w17,[inp,#8]
	// load iv
	ldp	w8,w9,[ivp]
	ldp	w10,w11,[ivp,#8]
#ifndef HITLS_BIG_ENDIAN
	rev	w8,w8
	rev	w9,w9
	rev	w10,w10
	rev	w11,w11
#endif
	EncRound
#ifndef HITLS_BIG_ENDIAN
	rev	w8,w8
	rev	w9,w9
	rev	w10,w10
	rev	w11,w11
#endif
	// save encrypted iv
	stp	w11,w10,[ivp]
	stp	w9,w8,[ivp,#8]

	cmp	len,#16
	b.lt	.Lcfb128_dec_final

	stp	w6,w7,[ivp]
	stp	w16,w17,[ivp,#8]
	eor	w11,w11,w6
	eor	w10,w10,w7
	eor	w9,w9,w16
	eor	w8,w8,w17
	stp	w11,w10,[outp],#8
	stp	w9,w8,[outp],#8
	add	inp,inp,#16
	subs	len,len,#16
	b.gt	1b
	b.eq	100f
	b	.Lcfb128_dec_final
.Llast_2_blocks:	// last two blocks
	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[ivp]
	mov	ptr,inp
	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[ptr],#16

	cmp	x2,#32
	b.gt	.Llast_3_blocks
	b.lt	1b
1:
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
#endif
	bl	Sm4Enc4blks
	ld1	{v4.4s,v5.4s},[inp],#32
	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
	eor	v0.16b,v0.16b,v4.16b
	eor	v1.16b,v1.16b,v5.16b
	st1	{v0.4s,v1.4s},[outp],#32
	// save back IV
	st1	{v5.4s}, [ivp]
	subs	len,len,#32
	b.eq	100f
	b	.Llast_block
.Llast_3_blocks:	// last 3 blocks
	cmp	len,#48
	b.lt	1b
	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[ptr]
#ifndef HITLS_BIG_ENDIAN
	rev32	v4.16b,v4.16b
	rev32	v5.16b,v5.16b
	rev32	v6.16b,v6.16b
	rev32	v7.16b,v7.16b
#endif
	bl	Sm4Enc4blks
	ld1	{v4.4s,v5.4s,v6.4s},[inp],#48
	transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
	eor	v0.16b,v0.16b,v4.16b
	eor	v1.16b,v1.16b,v5.16b
	eor	v2.16b,v2.16b,v6.16b
	st1	{v0.4s,v1.4s,v2.4s},[outp],#48
	// save back IV
	st1	{v6.4s}, [ivp]
	subs	len,len,#48
	b.eq	100f
	b	.Llast_block
.Lcfb128_dec_final:
	ldrb	w7,[ivp,x23]
	ldrb	w8,[inp]
	eor	w7,w7,w8
	strb	w7,[outp]
	// store the ciphertext byte into the iv buffer (CFB feedback)
	strb	w8,[ivp,x23]

	add	inp,inp,#1
	add	outp,outp,#1
	add	w23,w23,#1
	subs	len,len,#1
	b.ne	.Lcfb128_dec_final
100:
	// store num
	str	w23,[x5]
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x23,x24,[sp,#48]
	ldp	d8,d9,[sp,#64]
	ldp	d10,d11,[sp,#80]
	ldp	d12,d13,[sp,#96]
	ldp	d14,d15,[sp,#112]
	ldp	x29,x30,[sp],#128
AARCH64_AUTIASP
	ret
.size	Vpsm4Cfb128Decrypt,.-Vpsm4Cfb128Decrypt

#endif
