1/* 2 * This file is part of the openHiTLS project. 3 * 4 * openHiTLS is licensed under the Mulan PSL v2. 5 * You can use this software according to the terms and conditions of the Mulan PSL v2. 6 * You may obtain a copy of Mulan PSL v2 at: 7 * 8 * http://license.coscl.org.cn/MulanPSL2 9 * 10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, 11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, 12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. 13 * See the Mulan PSL v2 for more details. 14 */ 15 16#include "hitls_build.h" 17#ifdef HITLS_CRYPTO_SM4 18 19#include "crypt_arm.h" 20.arch armv8-a+crypto 21 22rk0 .req v12 23rk1 .req v13 24rka .req v14 25rkb .req v15 26rk2 .req v20 27rkc .req v21 28 29vtmp0 .req v0 30vtmp1 .req v1 31vtmp2 .req v2 32vtmp3 .req v3 33 34vtmp4 .req v24 35vtmp5 .req v25 36vtmp6 .req v22 37vtmp7 .req v23 38 39data0 .req v4 40data1 .req v5 41data2 .req v6 42data3 .req v7 43 44datax0 .req v8 45datax1 .req v9 46datax2 .req v10 47datax3 .req v11 48 49vtmpx0 .req v12 50vtmpx1 .req v13 51vtmpx2 .req v14 52vtmpx3 .req v15 53 54data10 .req v16 55data11 .req v17 56data12 .req v18 57data13 .req v19 58 59MaskV .req v26 60TAHMatV .req v27 61TALMatV .req v28 62ATAHMatV .req v29 63ATALMatV .req v30 64ANDMaskV .req v31 65 66MaskQ .req q26 67TAHMatQ .req q27 68TALMatQ .req q28 69ATAHMatQ .req q29 70ATALMatQ .req q30 71ANDMaskQ .req q31 72vtmp5q .req q25 73vtmp6q .req q22 74vtmp7q .req q23 75 76inp .req x0 77outp .req x1 78blocks .req w2 79rks .req x3 80 81wtmp0 .req w7 82wtmp1 .req w8 83wtmp2 .req w9 84 85ptr .req x10 86counter .req w11 87 88word0 .req w12 89word1 .req w13 90word2 .req w14 91word3 .req w15 92 93xword1 .req x13 94tbox0 .req x19 95tbox1 .req x20 96tbox2 .req x21 97tbox3 .req x22 98 99len .req x2 100ivp .req x4 101ctr .req w5 102ivec .req v3 103ivec1 .req v15 104 105.section .rodata 106.align 4 107.Ltbox1: 108.word 0xd55b5b8e, 0x924242d0, 0xeaa7a74d, 0xfdfbfb06, 0xcf3333fc, 0xe2878765, 0x3df4f4c9, 0xb5dede6b, 0x1658584e 109.word 0xb4dada6e, 0x14505044, 0xc10b0bca, 0x28a0a088, 0xf8efef17, 0x2cb0b09c, 0x05141411, 0x2bacac87, 0x669d9dfb 110.word 0x986a6af2, 0x77d9d9ae, 0x2aa8a882, 0xbcfafa46, 0x04101014, 0xc00f0fcf, 0xa8aaaa02, 0x45111154, 0x134c4c5f 111.word 0x269898be, 0x4825256d, 0x841a1a9e, 0x0618181e, 0x9b6666fd, 0x9e7272ec, 0x4309094a, 0x51414110, 0xf7d3d324 112.word 0x934646d5, 0xecbfbf53, 0x9a6262f8, 0x7be9e992, 0x33ccccff, 0x55515104, 0x0b2c2c27, 0x420d0d4f, 0xeeb7b759 113.word 0xcc3f3ff3, 0xaeb2b21c, 0x638989ea, 0xe7939374, 0xb1cece7f, 0x1c70706c, 0xaba6a60d, 0xca2727ed, 0x08202028 114.word 0xeba3a348, 0x975656c1, 0x82020280, 0xdc7f7fa3, 0x965252c4, 0xf9ebeb12, 0x74d5d5a1, 0x8d3e3eb3, 0x3ffcfcc3 115.word 0xa49a9a3e, 0x461d1d5b, 0x071c1c1b, 0xa59e9e3b, 0xfff3f30c, 0xf0cfcf3f, 0x72cdcdbf, 0x175c5c4b, 0xb8eaea52 116.word 0x810e0e8f, 0x5865653d, 0x3cf0f0cc, 0x1964647d, 0xe59b9b7e, 0x87161691, 0x4e3d3d73, 0xaaa2a208, 0x69a1a1c8 117.word 0x6aadadc7, 0x83060685, 0xb0caca7a, 0x70c5c5b5, 0x659191f4, 0xd96b6bb2, 0x892e2ea7, 0xfbe3e318, 0xe8afaf47 118.word 0x0f3c3c33, 0x4a2d2d67, 0x71c1c1b0, 0x5759590e, 0x9f7676e9, 0x35d4d4e1, 0x1e787866, 0x249090b4, 0x0e383836 119.word 0x5f797926, 0x628d8def, 0x59616138, 0xd2474795, 0xa08a8a2a, 0x259494b1, 0x228888aa, 0x7df1f18c, 0x3bececd7 120.word 0x01040405, 0x218484a5, 0x79e1e198, 0x851e1e9b, 0xd7535384, 0x00000000, 0x4719195e, 0x565d5d0b, 0x9d7e7ee3 121.word 0xd04f4f9f, 0x279c9cbb, 0x5349491a, 0x4d31317c, 0x36d8d8ee, 0x0208080a, 0xe49f9f7b, 0xa2828220, 0xc71313d4 122.word 
0xcb2323e8, 0x9c7a7ae6, 0xe9abab42, 0xbdfefe43, 0x882a2aa2, 0xd14b4b9a, 0x41010140, 0xc41f1fdb, 0x38e0e0d8 123.word 0xb7d6d661, 0xa18e8e2f, 0xf4dfdf2b, 0xf1cbcb3a, 0xcd3b3bf6, 0xfae7e71d, 0x608585e5, 0x15545441, 0xa3868625 124.word 0xe3838360, 0xacbaba16, 0x5c757529, 0xa6929234, 0x996e6ef7, 0x34d0d0e4, 0x1a686872, 0x54555501, 0xafb6b619 125.word 0x914e4edf, 0x32c8c8fa, 0x30c0c0f0, 0xf6d7d721, 0x8e3232bc, 0xb3c6c675, 0xe08f8f6f, 0x1d747469, 0xf5dbdb2e 126.word 0xe18b8b6a, 0x2eb8b896, 0x800a0a8a, 0x679999fe, 0xc92b2be2, 0x618181e0, 0xc30303c0, 0x29a4a48d, 0x238c8caf 127.word 0xa9aeae07, 0x0d343439, 0x524d4d1f, 0x4f393976, 0x6ebdbdd3, 0xd6575781, 0xd86f6fb7, 0x37dcdceb, 0x44151551 128.word 0xdd7b7ba6, 0xfef7f709, 0x8c3a3ab6, 0x2fbcbc93, 0x030c0c0f, 0xfcffff03, 0x6ba9a9c2, 0x73c9c9ba, 0x6cb5b5d9 129.word 0x6db1b1dc, 0x5a6d6d37, 0x50454515, 0x8f3636b9, 0x1b6c6c77, 0xadbebe13, 0x904a4ada, 0xb9eeee57, 0xde7777a9 130.word 0xbef2f24c, 0x7efdfd83, 0x11444455, 0xda6767bd, 0x5d71712c, 0x40050545, 0x1f7c7c63, 0x10404050, 0x5b696932 131.word 0xdb6363b8, 0x0a282822, 0xc20707c5, 0x31c4c4f5, 0x8a2222a8, 0xa7969631, 0xce3737f9, 0x7aeded97, 0xbff6f649 132.word 0x2db4b499, 0x75d1d1a4, 0xd3434390, 0x1248485a, 0xbae2e258, 0xe6979771, 0xb6d2d264, 0xb2c2c270, 0x8b2626ad 133.word 0x68a5a5cd, 0x955e5ecb, 0x4b292962, 0x0c30303c, 0x945a5ace, 0x76ddddab, 0x7ff9f986, 0x649595f1, 0xbbe6e65d 134.word 0xf2c7c735, 0x0924242d, 0xc61717d1, 0x6fb9b9d6, 0xc51b1bde, 0x86121294, 0x18606078, 0xf3c3c330, 0x7cf5f589 135.word 0xefb3b35c, 0x3ae8e8d2, 0xdf7373ac, 0x4c353579, 0x208080a0, 0x78e5e59d, 0xedbbbb56, 0x5e7d7d23, 0x3ef8f8c6 136.word 0xd45f5f8b, 0xc82f2fe7, 0x39e4e4dd, 0x49212168 137 138.Ltbox2: 139.word 0x5b5b8ed5, 0x4242d092, 0xa7a74dea, 0xfbfb06fd, 0x3333fccf, 0x878765e2, 0xf4f4c93d, 0xdede6bb5, 0x58584e16 140.word 0xdada6eb4, 0x50504414, 0x0b0bcac1, 0xa0a08828, 0xefef17f8, 0xb0b09c2c, 0x14141105, 0xacac872b, 0x9d9dfb66 141.word 0x6a6af298, 0xd9d9ae77, 0xa8a8822a, 0xfafa46bc, 0x10101404, 0x0f0fcfc0, 0xaaaa02a8, 0x11115445, 0x4c4c5f13 142.word 0x9898be26, 0x25256d48, 0x1a1a9e84, 0x18181e06, 0x6666fd9b, 0x7272ec9e, 0x09094a43, 0x41411051, 0xd3d324f7 143.word 0x4646d593, 0xbfbf53ec, 0x6262f89a, 0xe9e9927b, 0xccccff33, 0x51510455, 0x2c2c270b, 0xd0d4f42, 0xb7b759ee 144.word 0x3f3ff3cc, 0xb2b21cae, 0x8989ea63, 0x939374e7, 0xcece7fb1, 0x70706c1c, 0xa6a60dab, 0x2727edca, 0x20202808 145.word 0xa3a348eb, 0x5656c197, 0x02028082, 0x7f7fa3dc, 0x5252c496, 0xebeb12f9, 0xd5d5a174, 0x3e3eb38d, 0xfcfcc33f 146.word 0x9a9a3ea4, 0x1d1d5b46, 0x1c1c1b07, 0x9e9e3ba5, 0xf3f30cff, 0xcfcf3ff0, 0xcdcdbf72, 0x5c5c4b17, 0xeaea52b8 147.word 0x0e0e8f81, 0x65653d58, 0xf0f0cc3c, 0x64647d19, 0x9b9b7ee5, 0x16169187, 0x3d3d734e, 0xa2a208aa, 0xa1a1c869 148.word 0xadadc76a, 0x06068583, 0xcaca7ab0, 0xc5c5b570, 0x9191f465, 0x6b6bb2d9, 0x2e2ea789, 0xe3e318fb, 0xafaf47e8 149.word 0x3c3c330f, 0x2d2d674a, 0xc1c1b071, 0x59590e57, 0x7676e99f, 0xd4d4e135, 0x7878661e, 0x9090b424, 0x3838360e 150.word 0x7979265f, 0x8d8def62, 0x61613859, 0x474795d2, 0x8a8a2aa0, 0x9494b125, 0x8888aa22, 0xf1f18c7d, 0xececd73b 151.word 0x04040501, 0x8484a521, 0xe1e19879, 0x1e1e9b85, 0x535384d7, 0x00000000, 0x19195e47, 0x5d5d0b56, 0x7e7ee39d 152.word 0x4f4f9fd0, 0x9c9cbb27, 0x49491a53, 0x31317c4d, 0xd8d8ee36, 0x08080a02, 0x9f9f7be4, 0x828220a2, 0x1313d4c7 153.word 0x2323e8cb, 0x7a7ae69c, 0xabab42e9, 0xfefe43bd, 0x2a2aa288, 0x4b4b9ad1, 0x01014041, 0x1f1fdbc4, 0xe0e0d838 154.word 0xd6d661b7, 0x8e8e2fa1, 0xdfdf2bf4, 0xcbcb3af1, 0x3b3bf6cd, 0xe7e71dfa, 0x8585e560, 0x54544115, 0x868625a3 155.word 
0x838360e3, 0xbaba16ac, 0x7575295c, 0x929234a6, 0x6e6ef799, 0xd0d0e434, 0x6868721a, 0x55550154, 0xb6b619af 156.word 0x4e4edf91, 0xc8c8fa32, 0xc0c0f030, 0xd7d721f6, 0x3232bc8e, 0xc6c675b3, 0x8f8f6fe0, 0x7474691d, 0xdbdb2ef5 157.word 0x8b8b6ae1, 0xb8b8962e, 0x0a0a8a80, 0x9999fe67, 0x2b2be2c9, 0x8181e061, 0x0303c0c3, 0xa4a48d29, 0x8c8caf23 158.word 0xaeae07a9, 0x3434390d, 0x4d4d1f52, 0x3939764f, 0xbdbdd36e, 0x575781d6, 0x6f6fb7d8, 0xdcdceb37, 0x15155144 159.word 0x7b7ba6dd, 0xf7f709fe, 0x3a3ab68c, 0xbcbc932f, 0x0c0c0f03, 0xffff03fc, 0xa9a9c26b, 0xc9c9ba73, 0xb5b5d96c 160.word 0xb1b1dc6d, 0x6d6d375a, 0x45451550, 0x3636b98f, 0x6c6c771b, 0xbebe13ad, 0x4a4ada90, 0xeeee57b9, 0x7777a9de 161.word 0xf2f24cbe, 0xfdfd837e, 0x44445511, 0x6767bdda, 0x71712c5d, 0x05054540, 0x7c7c631f, 0x40405010, 0x6969325b 162.word 0x6363b8db, 0x2828220a, 0x0707c5c2, 0xc4c4f531, 0x2222a88a, 0x969631a7, 0x3737f9ce, 0xeded977a, 0xf6f649bf 163.word 0xb4b4992d, 0xd1d1a475, 0x434390d3, 0x48485a12, 0xe2e258ba, 0x979771e6, 0xd2d264b6, 0xc2c270b2, 0x2626ad8b 164.word 0xa5a5cd68, 0x5e5ecb95, 0x2929624b, 0x30303c0c, 0x5a5ace94, 0xddddab76, 0xf9f9867f, 0x9595f164, 0xe6e65dbb 165.word 0xc7c735f2, 0x24242d09, 0x1717d1c6, 0xb9b9d66f, 0x1b1bdec5, 0x12129486, 0x60607818, 0xc3c330f3, 0xf5f5897c 166.word 0xb3b35cef, 0xe8e8d23a, 0x7373acdf, 0x3535794c, 0x8080a020, 0xe5e59d78, 0xbbbb56ed, 0x7d7d235e, 0xf8f8c63e 167.word 0x5f5f8bd4, 0x2f2fe7c8, 0xe4e4dd39, 0x21216849 168 169.Ltbox3: 170.word 0x5b8ed55b, 0x42d09242, 0xa74deaa7, 0xfb06fdfb, 0x33fccf33, 0x8765e287, 0xf4c93df4, 0xde6bb5de, 0x584e1658 171.word 0xda6eb4da, 0x50441450, 0x0bcac10b, 0xa08828a0, 0xef17f8ef, 0xb09c2cb0, 0x14110514, 0xac872bac, 0x9dfb669d 172.word 0x6af2986a, 0xd9ae77d9, 0xa8822aa8, 0xfa46bcfa, 0x10140410, 0x0fcfc00f, 0xaa02a8aa, 0x11544511, 0x4c5f134c 173.word 0x98be2698, 0x256d4825, 0x1a9e841a, 0x181e0618, 0x66fd9b66, 0x72ec9e72, 0x094a4309, 0x41105141, 0xd324f7d3 174.word 0x46d59346, 0xbf53ecbf, 0x62f89a62, 0xe9927be9, 0xccff33cc, 0x51045551, 0x2c270b2c, 0x0d4f420d, 0xb759eeb7 175.word 0x3ff3cc3f, 0xb21caeb2, 0x89ea6389, 0x9374e793, 0xce7fb1ce, 0x706c1c70, 0xa60daba6, 0x27edca27, 0x20280820 176.word 0xa348eba3, 0x56c19756, 0x02808202, 0x7fa3dc7f, 0x52c49652, 0xeb12f9eb, 0xd5a174d5, 0x3eb38d3e, 0xfcc33ffc 177.word 0x9a3ea49a, 0x1d5b461d, 0x1c1b071c, 0x9e3ba59e, 0xf30cfff3, 0xcf3ff0cf, 0xcdbf72cd, 0x5c4b175c, 0xea52b8ea 178.word 0x0e8f810e, 0x653d5865, 0xf0cc3cf0, 0x647d1964, 0x9b7ee59b, 0x16918716, 0x3d734e3d, 0xa208aaa2, 0xa1c869a1 179.word 0xadc76aad, 0x06858306, 0xca7ab0ca, 0xc5b570c5, 0x91f46591, 0x6bb2d96b, 0x2ea7892e, 0xe318fbe3, 0xaf47e8af 180.word 0x3c330f3c, 0x2d674a2d, 0xc1b071c1, 0x590e5759, 0x76e99f76, 0xd4e135d4, 0x78661e78, 0x90b42490, 0x38360e38 181.word 0x79265f79, 0x8def628d, 0x61385961, 0x4795d247, 0x8a2aa08a, 0x94b12594, 0x88aa2288, 0xf18c7df1, 0xecd73bec 182.word 0x04050104, 0x84a52184, 0xe19879e1, 0x1e9b851e, 0x5384d753, 0x00000000, 0x195e4719, 0x5d0b565d, 0x7ee39d7e 183.word 0x4f9fd04f, 0x9cbb279c, 0x491a5349, 0x317c4d31, 0xd8ee36d8, 0x080a0208, 0x9f7be49f, 0x8220a282, 0x13d4c713 184.word 0x23e8cb23, 0x7ae69c7a, 0xab42e9ab, 0xfe43bdfe, 0x2aa2882a, 0x4b9ad14b, 0x01404101, 0x1fdbc41f, 0xe0d838e0 185.word 0xd661b7d6, 0x8e2fa18e, 0xdf2bf4df, 0xcb3af1cb, 0x3bf6cd3b, 0xe71dfae7, 0x85e56085, 0x54411554, 0x8625a386 186.word 0x8360e383, 0xba16acba, 0x75295c75, 0x9234a692, 0x6ef7996e, 0xd0e434d0, 0x68721a68, 0x55015455, 0xb619afb6 187.word 0x4edf914e, 0xc8fa32c8, 0xc0f030c0, 0xd721f6d7, 0x32bc8e32, 0xc675b3c6, 0x8f6fe08f, 0x74691d74, 0xdb2ef5db 188.word 
0x8b6ae18b, 0xb8962eb8, 0x0a8a800a, 0x99fe6799, 0x2be2c92b, 0x81e06181, 0x03c0c303, 0xa48d29a4, 0x8caf238c 189.word 0xae07a9ae, 0x34390d34, 0x4d1f524d, 0x39764f39, 0xbdd36ebd, 0x5781d657, 0x6fb7d86f, 0xdceb37dc, 0x15514415 190.word 0x7ba6dd7b, 0xf709fef7, 0x3ab68c3a, 0xbc932fbc, 0x0c0f030c, 0xff03fcff, 0xa9c26ba9, 0xc9ba73c9, 0xb5d96cb5 191.word 0xb1dc6db1, 0x6d375a6d, 0x45155045, 0x36b98f36, 0x6c771b6c, 0xbe13adbe, 0x4ada904a, 0xee57b9ee, 0x77a9de77 192.word 0xf24cbef2, 0xfd837efd, 0x44551144, 0x67bdda67, 0x712c5d71, 0x05454005, 0x7c631f7c, 0x40501040, 0x69325b69 193.word 0x63b8db63, 0x28220a28, 0x07c5c207, 0xc4f531c4, 0x22a88a22, 0x9631a796, 0x37f9ce37, 0xed977aed, 0xf649bff6 194.word 0xb4992db4, 0xd1a475d1, 0x4390d343, 0x485a1248, 0xe258bae2, 0x9771e697, 0xd264b6d2, 0xc270b2c2, 0x26ad8b26 195.word 0xa5cd68a5, 0x5ecb955e, 0x29624b29, 0x303c0c30, 0x5ace945a, 0xddab76dd, 0xf9867ff9, 0x95f16495, 0xe65dbbe6 196.word 0xc735f2c7, 0x242d0924, 0x17d1c617, 0xb9d66fb9, 0x1bdec51b, 0x12948612, 0x60781860, 0xc330f3c3, 0xf5897cf5 197.word 0xb35cefb3, 0xe8d23ae8, 0x73acdf73, 0x35794c35, 0x80a02080, 0xe59d78e5, 0xbb56edbb, 0x7d235e7d, 0xf8c63ef8 198.word 0x5f8bd45f, 0x2fe7c82f, 0xe4dd39e4, 0x21684921 199 200.Ltbox4: 201.word 0x8ed55b5b, 0xd0924242, 0x4deaa7a7, 0x06fdfbfb, 0xfccf3333, 0x65e28787, 0xc93df4f4, 0x6bb5dede, 0x4e165858 202.word 0x6eb4dada, 0x44145050, 0xcac10b0b, 0x8828a0a0, 0x17f8efef, 0x9c2cb0b0, 0x11051414, 0x872bacac, 0xfb669d9d 203.word 0xf2986a6a, 0xae77d9d9, 0x822aa8a8, 0x46bcfafa, 0x14041010, 0xcfc00f0f, 0x02a8aaaa, 0x54451111, 0x5f134c4c 204.word 0xbe269898, 0x6d482525, 0x9e841a1a, 0x1e061818, 0xfd9b6666, 0xec9e7272, 0x4a430909, 0x10514141, 0x24f7d3d3 205.word 0xd5934646, 0x53ecbfbf, 0xf89a6262, 0x927be9e9, 0xff33cccc, 0x04555151, 0x270b2c2c, 0x4f420d0d, 0x59eeb7b7 206.word 0xf3cc3f3f, 0x1caeb2b2, 0xea638989, 0x74e79393, 0x7fb1cece, 0x6c1c7070, 0x0daba6a6, 0xedca2727, 0x28082020 207.word 0x48eba3a3, 0xc1975656, 0x80820202, 0xa3dc7f7f, 0xc4965252, 0x12f9ebeb, 0xa174d5d5, 0xb38d3e3e, 0xc33ffcfc 208.word 0x3ea49a9a, 0x5b461d1d, 0x1b071c1c, 0x3ba59e9e, 0x0cfff3f3, 0x3ff0cfcf, 0xbf72cdcd, 0x4b175c5c, 0x52b8eaea 209.word 0x8f810e0e, 0x3d586565, 0xcc3cf0f0, 0x7d196464, 0x7ee59b9b, 0x91871616, 0x734e3d3d, 0x08aaa2a2, 0xc869a1a1 210.word 0xc76aadad, 0x85830606, 0x7ab0caca, 0xb570c5c5, 0xf4659191, 0xb2d96b6b, 0xa7892e2e, 0x18fbe3e3, 0x47e8afaf 211.word 0x330f3c3c, 0x674a2d2d, 0xb071c1c1, 0x0e575959, 0xe99f7676, 0xe135d4d4, 0x661e7878, 0xb4249090, 0x360e3838 212.word 0x265f7979, 0xef628d8d, 0x38596161, 0x95d24747, 0x2aa08a8a, 0xb1259494, 0xaa228888, 0x8c7df1f1, 0xd73becec 213.word 0x05010404, 0xa5218484, 0x9879e1e1, 0x9b851e1e, 0x84d75353, 0x00000000, 0x5e471919, 0x0b565d5d, 0xe39d7e7e 214.word 0x9fd04f4f, 0xbb279c9c, 0x1a534949, 0x7c4d3131, 0xee36d8d8, 0x0a020808, 0x7be49f9f, 0x20a28282, 0xd4c71313 215.word 0xe8cb2323, 0xe69c7a7a, 0x42e9abab, 0x43bdfefe, 0xa2882a2a, 0x9ad14b4b, 0x40410101, 0xdbc41f1f, 0xd838e0e0 216.word 0x61b7d6d6, 0x2fa18e8e, 0x2bf4dfdf, 0x3af1cbcb, 0xf6cd3b3b, 0x1dfae7e7, 0xe5608585, 0x41155454, 0x25a38686 217.word 0x60e38383, 0x16acbaba, 0x295c7575, 0x34a69292, 0xf7996e6e, 0xe434d0d0, 0x721a6868, 0x01545555, 0x19afb6b6 218.word 0xdf914e4e, 0xfa32c8c8, 0xf030c0c0, 0x21f6d7d7, 0xbc8e3232, 0x75b3c6c6, 0x6fe08f8f, 0x691d7474, 0x2ef5dbdb 219.word 0x6ae18b8b, 0x962eb8b8, 0x8a800a0a, 0xfe679999, 0xe2c92b2b, 0xe0618181, 0xc0c30303, 0x8d29a4a4, 0xaf238c8c 220.word 0x07a9aeae, 0x390d3434, 0x1f524d4d, 0x764f3939, 0xd36ebdbd, 0x81d65757, 0xb7d86f6f, 0xeb37dcdc, 0x51441515 221.word 
0xa6dd7b7b, 0x09fef7f7, 0xb68c3a3a, 0x932fbcbc, 0x0f030c0c, 0x03fcffff, 0xc26ba9a9, 0xba73c9c9, 0xd96cb5b5 222.word 0xdc6db1b1, 0x375a6d6d, 0x15504545, 0xb98f3636, 0x771b6c6c, 0x13adbebe, 0xda904a4a, 0x57b9eeee, 0xa9de7777 223.word 0x4cbef2f2, 0x837efdfd, 0x55114444, 0xbdda6767, 0x2c5d7171, 0x45400505, 0x631f7c7c, 0x50104040, 0x325b6969 224.word 0xb8db6363, 0x220a2828, 0xc5c20707, 0xf531c4c4, 0xa88a2222, 0x31a79696, 0xf9ce3737, 0x977aeded, 0x49bff6f6 225.word 0x992db4b4, 0xa475d1d1, 0x90d34343, 0x5a124848, 0x58bae2e2, 0x71e69797, 0x64b6d2d2, 0x70b2c2c2, 0xad8b2626 226.word 0xcd68a5a5, 0xcb955e5e, 0x624b2929, 0x3c0c3030, 0xce945a5a, 0xab76dddd, 0x867ff9f9, 0xf1649595, 0x5dbbe6e6 227.word 0x35f2c7c7, 0x2d092424, 0xd1c61717, 0xd66fb9b9, 0xdec51b1b, 0x94861212, 0x78186060, 0x30f3c3c3, 0x897cf5f5 228.word 0x5cefb3b3, 0xd23ae8e8, 0xacdf7373, 0x794c3535, 0xa0208080, 0x9d78e5e5, 0x56edbbbb, 0x235e7d7d, 0xc63ef8f8 229.word 0x8bd45f5f, 0xe7c82f2f, 0xdd39e4e4, 0x68492121 230 231#ifdef HITLS_BIG_ENDIAN 232.Lxts_magic: 233 .quad 0x0101010101010101,0x0101010101010187 234 235.Lsbox_magic: 236 .quad 0x0306090c0f020508,0x0b0e0104070a0d00 237 .quad 0x22581a6002783a40,0x62185a2042387a00 238 .quad 0xc10bb67c4a803df7,0x15df62a89e54e923 239 .quad 0x1407c6d56c7fbead,0xb9aa6b78c1d21300 240 .quad 0xe383c1a1fe9edcbc,0x6404462679195b3b 241 .quad 0x0E0D0C0F0A09080B,0x0605040702010003 242 .quad 0x0D0C0F0E09080B0A,0x0504070601000302 243 .quad 0x0C0F0E0D080B0A09,0x0407060500030201 244#else 245.Lxts_magic: 246 .quad 0x0101010101010187,0x0101010101010101 247 248.Lsbox_magic: 249 .quad 0x0b0e0104070a0d00,0x0306090c0f020508 250 .quad 0x62185a2042387a00,0x22581a6002783a40 251 .quad 0x15df62a89e54e923,0xc10bb67c4a803df7 252 .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead 253 .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc 254 .quad 0x0605040702010003,0x0E0D0C0F0A09080B 255 .quad 0x0504070601000302,0x0D0C0F0E09080B0A 256 .quad 0x0407060500030201,0x0C0F0E0D080B0A09 257#endif 258 259.macro LoadSbox 260 adrp x15,.Lsbox_magic 261 add x15,x15,:lo12:.Lsbox_magic 262 ldr MaskQ, [x15] 263 ldr TAHMatQ, [x15, #16] 264 ldr TALMatQ, [x15, #32] 265 ldr ATAHMatQ, [x15, #48] 266 ldr ATALMatQ, [x15, #64] 267 ldr vtmp5q, [x15, #80] 268 ldr vtmp6q, [x15, #96] 269 ldr vtmp7q, [x15, #112] 270.endm 271 272.macro round x1, x2, x3, x4, rk 273 eor word0,\x2, \x3 274 eor word0, word0, \rk 275 eor word0, word0, \x4 276 277 and word1, word0, #0xff 278 ldr word1, [tbox0,xword1,lsl #2] 279 eor \x1, word1, \x1 280 281 ubfx word1, word0,#8,#8 282 ldr word1, [tbox1, xword1, lsl #2] 283 eor \x1, word1, \x1 284 285 ubfx word1, word0, #16, #8 286 ldr word1,[tbox2, xword1, lsl #2] 287 eor \x1, word1, \x1 288 289 lsr word1, word0, #24 290 ldr word1, [tbox3, xword1, lsl #2] 291 eor \x1, word1, \x1 292.endm 293 294.macro EncRound4 offset1, offset2 295 ldp word2, word3,[rks, \offset1] 296 round w8, w9, w10, w11, word2 297 round w9, w10, w11, w8, word3 298 ldp word2, word3,[rks, \offset2] 299 round w10, w11, w8, w9, word2 300 round w11, w8, w9, w10, word3 301.endm 302 303.macro EncRound 304 EncRound4 0, 8 305 EncRound4 16, 24 306 EncRound4 32, 40 307 EncRound4 48, 56 308 EncRound4 64, 72 309 EncRound4 80, 88 310 EncRound4 96, 104 311 EncRound4 112, 120 312.endm 313 314.macro transpose dat0s, dat1s, dat2s, dat3s, dat0d, dat1d, dat2d, dat3d, vt0s, vt1s, vt2s, vt3s, vt0d, vt1d, vt2d, vt3d 315 zip1 \vt0s, \dat0s, \dat1s 316 zip2 \vt1s, \dat0s, \dat1s 317 zip1 \vt2s, \dat2s, \dat3s 318 zip2 \vt3s, \dat2s, \dat3s 319 zip1 \dat0d, \vt0d, \vt2d 320 zip2 \dat1d, \vt0d, \vt2d 321 zip1 
\dat2d, \vt1d, \vt3d 322 zip2 \dat3d, \vt1d, \vt3d 323.endm 324 325.macro Encrypt1blkNorevCtr 326 mov w8,ivec.s[0] 327 mov w9,ivec.s[1] 328 mov w10,ivec.s[2] 329 mov w11,ivec.s[3] 330 EncRound 331 mov ivec.s[0],w11 332 mov ivec.s[1],w10 333 mov ivec.s[2],w9 334 mov ivec.s[3],w8 335#ifndef HITLS_BIG_ENDIAN 336 rev32 v3.16b,v3.16b 337#endif 338.endm 339 340# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) 341.macro MulMatrix x, higherMat, lowerMat, tmp 342 ushr \tmp, \x, 4 343 and \x, \x, ANDMaskV.16b 344 tbl \x, {\lowerMat}, \x 345 tbl \tmp, {\higherMat}, \tmp 346 eor \x, \x, \tmp 347.endm 348 349# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) 350.macro MulMatrixOut x, higherMat, lowerMat, tmp, out 351 ushr \tmp, \x, 4 352 and \x, \x, ANDMaskV.16b 353 tbl \x, {\lowerMat}, \x 354 tbl \tmp, {\higherMat}, \tmp 355 eor \out, \x, \tmp 356.endm 357 358# Sbox operations for 4-lane of words 359.macro Sbox dat, dat2 360 movi ANDMaskV.16b, #0x0f 361 // optimize Sbox using AESE instruction 362 tbl v0.16b, {\dat}, MaskV.16b 363 MulMatrix v0.16b, TAHMatV.16b, TALMatV.16b, v24.16b 364 365 eor v1.16b, v1.16b, v1.16b 366 aese v0.16b, v1.16b 367 368 MulMatrix v0.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b 369 370 mov \dat, v0.16b 371 372 // linear transformation 373 ushr v0.4s, \dat2,32-2 374 ushr v1.4s, \dat2,32-10 375 ushr v2.4s, \dat2,32-18 376 ushr v3.4s, \dat2,32-24 377 sli v0.4s, \dat2,2 378 sli v1.4s, \dat2,10 379 sli v2.4s, \dat2,18 380 sli v3.4s, \dat2,24 381 eor v24.16b, v0.16b, \dat 382 eor v24.16b, v24.16b, v1.16b 383 eor \dat, v2.16b, v3.16b 384 eor \dat, \dat, v24.16b 385.endm 386 387# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 388.macro Sm44blks kptr 389 ldp wtmp0, wtmp1,[\kptr],8 390 dup rk0.4s, wtmp0 391 dup rk1.4s, wtmp1 392 393 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 394 eor rka.16b, v6.16b, v7.16b 395 eor rk0.16b, v5.16b, rk0.16b 396 eor rk0.16b, rka.16b, rk0.16b 397 398 Sbox rk0.16b, rk0.4s 399 400 eor v4.16b, v4.16b, rk0.16b 401 402 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 403 eor rka.16b, rka.16b, v4.16b 404 eor rk1.16b, rka.16b, rk1.16b 405 406 Sbox rk1.16b, rk1.4s 407 408 ldp wtmp0, wtmp1,[\kptr],8 409 eor v5.16b,v5.16b, rk1.16b 410 411 dup rk0.4s, wtmp0 412 dup rk1.4s, wtmp1 413 414 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 415 eor rka.16b, v4.16b, v5.16b 416 eor rk0.16b, v7.16b, rk0.16b 417 eor rk0.16b, rka.16b, rk0.16b 418 419 Sbox rk0.16b, rk0.4s 420 421 eor v6.16b, v6.16b, rk0.16b 422 423 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 424 eor rka.16b, rka.16b, v6.16b 425 eor rk1.16b, rka.16b, rk1.16b 426 427 Sbox rk1.16b, rk1.4s 428 429 eor v7.16b, v7.16b, rk1.16b 430.endm 431 432 433.macro Encrypt4blks 434 mov ptr, rks 435 mov counter,#8 43610: 437 Sm44blks ptr 438 439 subs counter, counter,#1 440 b.ne 10b 441#ifndef HITLS_BIG_ENDIAN 442 rev32 v3.16b,v4.16b 443 rev32 v2.16b,v5.16b 444 rev32 v1.16b,v6.16b 445 rev32 v0.16b,v7.16b 446#else 447 mov v3.16b,v4.16b 448 mov v2.16b,v5.16b 449 mov v1.16b,v6.16b 450 mov v0.16b,v7.16b 451#endif 452.endm 453 454# Sbox operation for 8-lane of words 455.macro SboxDouble dat datx 456 movi ANDMaskV.16b, #0x0f 457 // optimize Sbox using AESE instruction 458 tbl v0.16b, {rk0.16b}, MaskV.16b 459 tbl v1.16b, {rk1.16b}, MaskV.16b 460 461 MulMatrix v0.16b, TAHMatV.16b, TALMatV.16b, v24.16b 462 MulMatrix v1.16b, TAHMatV.16b, TALMatV.16b, v24.16b 463 eor vtmp5.16b, vtmp5.16b, vtmp5.16b 464 aese v0.16b,vtmp5.16b 465 aese v1.16b,vtmp5.16b 466 MulMatrixOut v0.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b, rk0.16b 467 MulMatrixOut v1.16b, 
ATAHMatV.16b, ATALMatV.16b, v24.16b, rk1.16b 468 469 // linear transformation 470 ushr v0.4s,rk0.4s,32-2 471 ushr vtmp5.4s,rk1.4s,32-2 472 ushr v1.4s,rk0.4s,32-10 473 ushr v2.4s,rk0.4s,32-18 474 ushr v3.4s,rk0.4s,32-24 475 sli v0.4s,rk0.4s,2 476 sli vtmp5.4s,rk1.4s,2 477 sli v1.4s,rk0.4s,10 478 sli v2.4s,rk0.4s,18 479 sli v3.4s,rk0.4s,24 480 eor v24.16b,v0.16b,rk0.16b 481 eor v24.16b,v24.16b,v1.16b 482 eor rk0.16b,v2.16b,v3.16b 483 eor rk0.16b,rk0.16b,v24.16b 484 ushr v1.4s,rk1.4s,32-10 485 ushr v2.4s,rk1.4s,32-18 486 ushr v3.4s,rk1.4s,32-24 487 sli v1.4s,rk1.4s,10 488 sli v2.4s,rk1.4s,18 489 sli v3.4s,rk1.4s,24 490 eor v24.16b,vtmp5.16b,rk1.16b 491 eor v24.16b,v24.16b,v1.16b 492 eor rk1.16b,v2.16b,v3.16b 493 eor rk1.16b,rk1.16b,v24.16b 494.endm 495 496 497.macro SboxThree dat, datx, dat1 498 movi ANDMaskV.16b, #0x0f 499 // optimize sbox using AESE instruction 500 tbl v0.16b, {\dat}, MaskV.16b 501 tbl v1.16b, {\datx}, MaskV.16b 502 tbl v2.16b, {\dat1}, MaskV.16b 503 eor v3.16b, v3.16b, v3.16b 504 505 MulMatrix v0.16b, TAHMatV.16b, TALMatV.16b, v24.16b 506 MulMatrix v1.16b, TAHMatV.16b, TALMatV.16b, v24.16b 507 508 aese v0.16b, v3.16b 509 510 MulMatrix v2.16b, TAHMatV.16b, TALMatV.16b, v24.16b 511 512 aese v1.16b, v3.16b 513 aese v2.16b, v3.16b 514 515 MulMatrixOut v0.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b, \dat 516 MulMatrixOut v1.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b, \datx 517 MulMatrixOut v2.16b, ATAHMatV.16b, ATALMatV.16b, v24.16b, \dat1 518 519 // linear transformation 520 tbl v0.16b, {\dat}, vtmp5.16b // shitf left 8 521 tbl v1.16b, {\datx}, vtmp5.16b 522 tbl v2.16b, {\dat1}, vtmp5.16b 523 524 tbl v3.16b, {\dat}, v22.16b // shitf left 16 525 tbl v24.16b, {\datx}, v22.16b 526 tbl ANDMaskV.16b, {\dat1}, v22.16b 527 528 eor v0.16b, v0.16b, \dat 529 eor v1.16b, v1.16b, \datx 530 eor v2.16b, v2.16b, \dat1 531 532 eor v0.16b, v0.16b, v3.16b 533 eor v1.16b, v1.16b, v24.16b 534 eor v2.16b, v2.16b, ANDMaskV.16b 535 536 shl v3.4s, v0.4s, #2 // shift left by 2 bits, equivalent to v12<<2 xor v12<<10 xor v12<<18 537 sri v3.4s, v0.4s, #30 538 shl v24.4s, v1.4s, #2 539 sri v24.4s, v1.4s, #30 540 shl ANDMaskV.4s, v2.4s, #2 541 sri ANDMaskV.4s, v2.4s, #30 542 543 tbl v0.16b, {\dat}, v23.16b // shitf left 24 544 tbl v1.16b, {\datx}, v23.16b 545 tbl v2.16b, {\dat1}, v23.16b 546 547 eor \dat, \dat, v3.16b 548 eor \datx, \datx, v24.16b 549 eor \dat1, \dat1, ANDMaskV.16b 550 551 eor \dat, v0.16b, \dat 552 eor \datx, v1.16b, \datx 553 eor \dat1, v2.16b, \dat1 554.endm 555 556.macro Sm48blks kptr 557 ldp wtmp0, wtmp1,[\kptr],8 558 559 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 560 dup rk0.4s, wtmp0 561 eor rka.16b,v6.16b,v7.16b 562 eor rkb.16b,v10.16b,v11.16b 563 eor v0.16b,v5.16b,rk0.16b 564 eor v1.16b,v9.16b,rk0.16b 565 eor rk0.16b, rka.16b,v0.16b 566 eor rk1.16b, rkb.16b,v1.16b 567 SboxDouble 568 eor v4.16b, v4.16b, rk0.16b 569 eor v8.16b,v8.16b, rk1.16b 570 571 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 572 dup rk1.4s, wtmp1 573 eor rka.16b,rka.16b,v4.16b 574 eor rkb.16b,rkb.16b,v8.16b 575 eor rk0.16b,rka.16b,rk1.16b 576 eor rk1.16b,rkb.16b,rk1.16b 577 SboxDouble 578 579 ldp wtmp0, wtmp1,[\kptr],8 580 eor v5.16b,v5.16b,rk0.16b 581 eor v9.16b,v9.16b,rk1.16b 582 583 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 584 dup rk0.4s, wtmp0 585 eor rka.16b,v4.16b,v5.16b 586 eor rkb.16b,v8.16b,v9.16b 587 eor v0.16b,v7.16b,rk0.16b 588 eor v1.16b,v11.16b,rk0.16b 589 eor rk0.16b,rka.16b,v0.16b 590 eor rk1.16b,rkb.16b,v1.16b 591 SboxDouble 592 593 eor v6.16b,v6.16b,rk0.16b 594 eor v10.16b,v10.16b,rk1.16b 595 596 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ 
RK3) 597 dup rk1.4s, wtmp1 598 eor rka.16b,rka.16b,v6.16b 599 eor rkb.16b,rkb.16b,v10.16b 600 eor rk0.16b,rka.16b,rk1.16b 601 eor rk1.16b,rkb.16b,rk1.16b 602 SboxDouble 603 604 eor v7.16b,v7.16b,rk0.16b 605 eor v11.16b,v11.16b,rk1.16b 606.endm 607 608 609.macro Sm412blks kptr 610 ldp wtmp0,wtmp1,[\kptr],8 611 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 612 dup rk0.4s,wtmp0 613 eor rka.16b,v6.16b,v7.16b 614 eor rkb.16b,v10.16b,v11.16b 615 eor rkc.16b,v18.16b,v19.16b 616 eor v0.16b,v5.16b,rk0.16b 617 eor v1.16b,v9.16b,rk0.16b 618 eor v2.16b,v17.16b,rk0.16b 619 eor rk0.16b,rka.16b,v0.16b 620 eor rk1.16b,rkb.16b,v1.16b 621 eor rk2.16b,rkc.16b,v2.16b 622 623 SboxThree rk0.16b, rk1.16b, rk2.16b 624 625 eor v4.16b,v4.16b,rk0.16b 626 eor v8.16b,v8.16b,rk1.16b 627 eor v16.16b,v16.16b,rk2.16b 628 629 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 630 dup rk1.4s,wtmp1 631 eor rka.16b,rka.16b,v4.16b 632 eor rkb.16b,rkb.16b,v8.16b 633 eor rkc.16b,rkc.16b,v16.16b 634 eor rk0.16b,rka.16b,rk1.16b 635 eor rk2.16b,rkc.16b,rk1.16b 636 eor rk1.16b,rkb.16b,rk1.16b 637 638 SboxThree rk0.16b, rk1.16b, rk2.16b 639 640 ldp wtmp0,wtmp1,[\kptr],8 641 eor v5.16b,v5.16b,rk0.16b 642 eor v9.16b,v9.16b,rk1.16b 643 eor v17.16b,v17.16b,rk2.16b 644 645 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 646 dup rk0.4s,wtmp0 647 eor rka.16b,v4.16b,v5.16b 648 eor rkb.16b,v8.16b,v9.16b 649 eor rkc.16b,v16.16b,v17.16b 650 eor v0.16b,v7.16b,rk0.16b 651 eor v1.16b,v11.16b,rk0.16b 652 eor v2.16b,v19.16b,rk0.16b 653 eor rk0.16b,rka.16b,v0.16b 654 eor rk1.16b,rkb.16b,v1.16b 655 eor rk2.16b,rkc.16b,v2.16b 656 657 SboxThree rk0.16b, rk1.16b, rk2.16b 658 659 eor v6.16b,v6.16b,rk0.16b 660 eor v10.16b,v10.16b,rk1.16b 661 eor v18.16b,v18.16b,rk2.16b 662 663 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 664 dup rk1.4s,wtmp1 665 eor rka.16b,rka.16b,v6.16b 666 eor rkb.16b,rkb.16b,v10.16b 667 eor rkc.16b,rkc.16b,v18.16b 668 eor rk0.16b,rka.16b,rk1.16b 669 eor rk2.16b,rkc.16b,rk1.16b 670 eor rk1.16b,rkb.16b,rk1.16b 671 672 SboxThree rk0.16b, rk1.16b, rk2.16b 673 674 eor v7.16b,v7.16b,rk0.16b 675 eor v11.16b,v11.16b,rk1.16b 676 eor v19.16b,v19.16b,rk2.16b 677.endm 678 679 680.macro Encrypt8blks 681 mov ptr, rks 682 mov counter, #8 68310: 684 Sm48blks ptr 685 686 subs counter, counter,#1 687 b.ne 10b 688#ifndef HITLS_BIG_ENDIAN 689 rev32 v3.16b,v4.16b 690 rev32 v2.16b,v5.16b 691 rev32 v1.16b,v6.16b 692 rev32 v0.16b,v7.16b 693 rev32 v7.16b,v8.16b 694 rev32 v6.16b,v9.16b 695 rev32 v5.16b,v10.16b 696 rev32 v4.16b,v11.16b 697#else 698 mov v3.16b,v4.16b 699 mov v2.16b,v5.16b 700 mov v1.16b,v6.16b 701 mov v0.16b,v7.16b 702 mov v7.16b,v8.16b 703 mov v6.16b,v9.16b 704 mov v5.16b,v10.16b 705 mov v4.16b,v11.16b 706#endif 707.endm 708 709.macro Encrypt12blks 710 mov ptr, rks 711 mov counter, #8 71210: 713 Sm412blks ptr 714 715 subs counter,counter,#1 716 b.ne 10b 717 // last reverse transform 718#ifndef HITLS_BIG_ENDIAN 719 rev32 v3.16b,v4.16b 720 rev32 v2.16b,v5.16b 721 rev32 v1.16b,v6.16b 722 rev32 v0.16b,v7.16b 723 724 rev32 v7.16b,v8.16b 725 rev32 v6.16b,v9.16b 726 rev32 v5.16b,v10.16b 727 rev32 v4.16b,v11.16b 728 729 rev32 v11.16b,v16.16b 730 rev32 v10.16b,v17.16b 731 rev32 v9.16b,v18.16b 732 rev32 v8.16b,v19.16b 733#else 734 mov v3.16b,v4.16b 735 mov v2.16b,v5.16b 736 mov v1.16b,v6.16b 737 mov v0.16b,v7.16b 738 739 mov v7.16b,v8.16b 740 mov v6.16b,v9.16b 741 mov v5.16b,v10.16b 742 mov v4.16b,v11.16b 743 744 mov v11.16b,v16.16b 745 mov v10.16b,v17.16b 746 mov v9.16b,v18.16b 747 mov v8.16b,v19.16b 748#endif 749.endm 750 751.text 752.type Sm4Enc4blks,%function 753.align 4 754Sm4Enc4blks: 
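// encrypt 4 blocks in parallel: expects the blocks column-transposed (as
// loaded by ld4) in v4-v7, one 32-bit state word per register, with the
// per-word byte swap already applied by the caller on little-endian builds,
// and the round keys at [rks]. Encrypt4blks runs 8 iterations of Sm44blks
// (the full 32 SM4 rounds), applies the final reverse transform and leaves
// the result in v0-v3. SM4 decryption reuses the same routine with the
// round keys supplied in reverse order by the caller.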
755AARCH64_PACIASP 756 Encrypt4blks 757AARCH64_AUTIASP 758 ret 759.size Sm4Enc4blks,.-Sm4Enc4blks 760 761.type Sm4Enc8blks,%function 762.align 4 763Sm4Enc8blks: 764AARCH64_PACIASP 765 Encrypt8blks 766AARCH64_AUTIASP 767 ret 768.size Sm4Enc8blks,.-Sm4Enc8blks 769 770.type Sm4Enc12blks,%function 771.align 4 772Sm4Enc12blks: 773AARCH64_PACIASP 774 Encrypt12blks 775AARCH64_AUTIASP 776 ret 777.size Sm4Enc12blks,.-Sm4Enc12blks 778 779.globl Vpsm4EcbEncrypt 780.type Vpsm4EcbEncrypt,%function 781.align 5 782Vpsm4EcbEncrypt: 783AARCH64_PACIASP 784 // convert length into blocks 785 lsr x2,x2,4 786 stp d8,d9,[sp,#-112]! 787 stp d10,d11,[sp,#16] 788 stp d12,d13,[sp,#32] 789 stp d14,d15,[sp,#48] 790 stp x29,x30,[sp,#64] 791 stp x19,x20,[sp,#80] 792 stp x21,x22,[sp,#96] 793 LoadSbox 794 795.Lecb_12_blocks_process: 796 cmp blocks,#12 797 b.lt .Lecb_8_blocks_process 798 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[inp],#64 799 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[inp],#64 800 ld4 {v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64 801 802#ifndef HITLS_BIG_ENDIAN 803 rev32 v4.16b,v4.16b 804 rev32 v5.16b,v5.16b 805 rev32 v6.16b,v6.16b 806 rev32 v7.16b,v7.16b 807 808 rev32 v8.16b,v8.16b 809 rev32 v9.16b,v9.16b 810 rev32 v10.16b,v10.16b 811 rev32 v11.16b,v11.16b 812 813 rev32 v16.16b,v16.16b 814 rev32 v17.16b,v17.16b 815 rev32 v18.16b,v18.16b 816 rev32 v19.16b,v19.16b 817#endif 818 819 bl Sm4Enc12blks 820 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64 821 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64 822 st4 {v8.4s,v9.4s,v10.4s,v11.4s},[outp],#64 823 subs blocks,blocks,#12 824 b.gt .Lecb_12_blocks_process 825 b 100f 826 827.Lecb_8_blocks_process: 828 cmp blocks, #8 829 b.lt .Lecb_4_blocks_process 830 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 831 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 832#ifndef HITLS_BIG_ENDIAN 833 rev32 v4.16b,v4.16b 834 rev32 v5.16b,v5.16b 835 rev32 v6.16b,v6.16b 836 rev32 v7.16b,v7.16b 837 rev32 v8.16b,v8.16b 838 rev32 v9.16b,v9.16b 839 rev32 v10.16b,v10.16b 840 rev32 v11.16b,v11.16b 841#endif 842 bl Sm4Enc8blks 843 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 844 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 845 subs blocks,blocks,#8 846 b.gt .Lecb_8_blocks_process 847 b 100f 848.Lecb_4_blocks_process: 849 cmp blocks,#4 850 b.lt 1f 851 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 852#ifndef HITLS_BIG_ENDIAN 853 rev32 v4.16b, v4.16b 854 rev32 v5.16b, v5.16b 855 rev32 v6.16b, v6.16b 856 rev32 v7.16b, v7.16b 857#endif 858 bl Sm4Enc4blks 859 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 860 sub blocks,blocks,#4 8611: 862 // process last block 863 cmp blocks,#1 864 b.lt 100f 865 b.gt 1f 866 867 adrp x19, .Ltbox1 868 add x19,x19,:lo12:.Ltbox1 869 adrp x20, .Ltbox2 870 add x20,x20,:lo12:.Ltbox2 871 adrp x21, .Ltbox3 872 add x21,x21,:lo12:.Ltbox3 873 adrp x22, .Ltbox4 874 add x22,x22,:lo12:.Ltbox4 875 876 ldp w8,w9,[inp],#8 877 ldp w10,w11,[inp],#8 878#ifndef HITLS_BIG_ENDIAN 879 rev w8,w8 880 rev w9,w9 881 rev w10,w10 882 rev w11,w11 883#endif 884 EncRound 885#ifndef HITLS_BIG_ENDIAN 886 rev w8,w8 887 rev w9,w9 888 rev w10,w10 889 rev w11,w11 890#endif 891 stp w11,w10,[outp] 892 stp w9,w8,[outp,#8] 893 b 100f 8941: // process last 2 blocks 895 ld4 {v4.s,v5.s,v6.s,v7.s}[0],[inp],#16 896 ld4 {v4.s,v5.s,v6.s,v7.s}[1],[inp],#16 897 cmp blocks,#2 898 b.gt 1f 899#ifndef HITLS_BIG_ENDIAN 900 rev32 v4.16b,v4.16b 901 rev32 v5.16b,v5.16b 902 rev32 v6.16b,v6.16b 903 rev32 v7.16b,v7.16b 904#endif 905 bl Sm4Enc4blks 906 st4 {v0.s,v1.s,v2.s,v3.s}[0],[outp],#16 907 st4 {v0.s,v1.s,v2.s,v3.s}[1],[outp] 908 b 100f // 9091: // process last 3 blocks 910 ld4 
{v4.s,v5.s,v6.s,v7.s}[2],[inp],#16 911#ifndef HITLS_BIG_ENDIAN 912 rev32 v4.16b,v4.16b 913 rev32 v5.16b,v5.16b 914 rev32 v6.16b,v6.16b 915 rev32 v7.16b,v7.16b 916#endif 917 bl Sm4Enc4blks 918 st4 {v0.s,v1.s,v2.s,v3.s}[0],[outp],#16 919 st4 {v0.s,v1.s,v2.s,v3.s}[1],[outp],#16 920 st4 {v0.s,v1.s,v2.s,v3.s}[2],[outp] 921100: 922 ldp d10,d11,[sp,#16] 923 ldp d12,d13,[sp,#32] 924 ldp d14,d15,[sp,#48] 925 ldp x29,x30,[sp,#64] 926 ldp x19,x20,[sp,#80] 927 ldp x21,x22,[sp,#96] 928 ldp d8,d9,[sp],#112 929AARCH64_AUTIASP 930 ret 931.size Vpsm4EcbEncrypt,.-Vpsm4EcbEncrypt 932 933 934.globl Vpsm4CbcEncrypt 935.type Vpsm4CbcEncrypt,%function 936.align 5 937Vpsm4CbcEncrypt: 938AARCH64_PACIASP 939 lsr len,len,4 940 stp x29,x30,[sp,#-48]! 941 stp x19,x20,[sp,#16] 942 stp x21,x22,[sp,#32] 943 944 // load tbox 945 adrp x19, .Ltbox1 946 add x19,x19,:lo12:.Ltbox1 947 adrp x20, .Ltbox2 948 add x20,x20,:lo12:.Ltbox2 949 adrp x21, .Ltbox3 950 add x21,x21,:lo12:.Ltbox3 951 adrp x22, .Ltbox4 952 add x22,x22,:lo12:.Ltbox4 953 954 cbz w5,.Ldec 955 956 // load iv 957 ldp w8,w9,[ivp] 958 ldp w10,w11,[ivp,#8] 959.Lcbc_1_block_enc: 960 subs blocks,blocks,#1 961 b.lt 2f 962 ldp w6,w7,[inp],#8 963 ldp w16,w17,[inp],#8 964 eor w8,w8,w6 965 eor w9,w9,w7 966 eor w10,w10,w16 967 eor w11,w11,w17 968#ifndef HITLS_BIG_ENDIAN 969 rev w8,w8 970 rev w9,w9 971 rev w10,w10 972 rev w11,w11 973#endif 974 EncRound 975#ifndef HITLS_BIG_ENDIAN 976 rev w8,w8 977 rev w9,w9 978 rev w10,w10 979 rev w11,w11 980#endif 981 // reverse to store 982 mov w6,w8 983 mov w8,w11 984 mov w11,w6 985 mov w7,w9 986 mov w9,w10 987 mov w10,w7 988 989 stp w8,w9,[outp],#8 990 stp w10,w11,[outp],#8 991 b .Lcbc_1_block_enc 9922: 993 // save back IV 994 stp w8,w9,[ivp] 995 stp w10,w11,[ivp,#8] 996 997 ldp x19,x20,[sp,#16] 998 ldp x21,x22,[sp,#32] 999 ldp x29,x30,[sp],#48 1000AARCH64_AUTIASP 1001 ret 1002 1003.Ldec: 1004 LoadSbox 1005 // decryption mode starts 1006 stp d8,d9,[sp,#-64]! 
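    // CBC decryption below is vectorised: unlike CBC encryption, which is
    // inherently serial (each ciphertext block feeds the next), decryption
    // can push 12/8/4 blocks through the SM4 kernels in parallel and then
    // XOR each result with the preceding ciphertext block (the IV for the
    // first one). The last ciphertext block of each batch is written back
    // to [ivp] as the chaining value for the next call.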
1007 stp d10,d11,[sp,#16] 1008 stp d12,d13,[sp,#32] 1009 stp d14,d15,[sp,#48] 1010 1011.Lcbc_12_blocks_dec: 1012 cmp w2,#12 1013 b.lt .Lcbc_8_blocks_dec 1014 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] 1015 add x10,x0,#64 1016 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10] 1017 add x10,x10,#64 1018 ld4 {v16.4s,v17.4s,v18.4s,v19.4s},[x10] 1019 1020#ifndef HITLS_BIG_ENDIAN 1021 rev32 v4.16b,v4.16b 1022 rev32 v5.16b,v5.16b 1023 rev32 v6.16b,v6.16b 1024 rev32 v7.16b,v7.16b 1025 rev32 v8.16b,v8.16b 1026 rev32 v9.16b,v9.16b 1027 rev32 v10.16b,v10.16b 1028 rev32 v11.16b,v11.16b 1029 rev32 v16.16b,v16.16b 1030 rev32 v17.16b,v17.16b 1031 rev32 v18.16b,v18.16b 1032 rev32 v19.16b,v19.16b 1033#endif 1034 bl Sm4Enc12blks 1035 // transpose to xor iv 1036 transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d 1037 transpose v4.4s,v5.4s,v6.4s,v7.4s,v4.2d,v5.2d,v6.2d,v7.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d 1038 transpose v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d 1039 ld1 {ivec1.4s},[ivp] 1040 ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64 1041 eor v0.16b,v0.16b,ivec1.16b 1042 ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64 1043 eor v1.16b,v1.16b,v16.16b 1044 eor v2.16b,v2.16b,v17.16b 1045 eor v3.16b,v3.16b,v18.16b 1046 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64 1047 1048 eor v4.16b,v4.16b,v19.16b 1049 eor v5.16b,v5.16b,v12.16b 1050 eor v6.16b,v6.16b,v13.16b 1051 eor v7.16b,v7.16b,v14.16b 1052 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64 1053 1054 ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64 1055 eor v8.16b,v8.16b,v15.16b 1056 eor v9.16b,v9.16b,v16.16b 1057 eor v10.16b,v10.16b,v17.16b 1058 eor v11.16b,v11.16b,v18.16b 1059 st1 {v8.4s,v9.4s,v10.4s,v11.4s},[outp],#64 1060 // save back iv 1061 st1 {v19.4s}, [ivp] 1062 1063 subs blocks,blocks,#12 1064 b.gt .Lcbc_12_blocks_dec 1065 b 100f 1066 1067.Lcbc_8_blocks_dec: 1068 cmp blocks,#8 1069 b.lt 1f 1070 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[inp] 1071 add ptr, inp, #64 1072 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[ptr] 1073 1074#ifndef HITLS_BIG_ENDIAN 1075 rev32 v4.16b,v4.16b 1076 rev32 v5.16b,v5.16b 1077 rev32 v6.16b,v6.16b 1078 rev32 v7.16b,v7.16b 1079 rev32 v8.16b,v8.16b 1080 rev32 v9.16b,v9.16b 1081 rev32 v10.16b,v10.16b 1082 rev32 v11.16b,v11.16b 1083#endif 1084 bl Sm4Enc8blks 1085 transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d 1086 transpose v4.4s,v5.4s,v6.4s,v7.4s,v4.2d,v5.2d,v6.2d,v7.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d 1087 ld1 {ivec1.4s},[ivp] 1088 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[inp],#64 1089 // note ivec1 and v15 are resuing the same register 1090 // care needs to be taken to avoid conflict 1091 eor v0.16b,v0.16b,ivec1.16b 1092 ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64 1093 eor v1.16b,v1.16b,v8.16b 1094 eor v2.16b,v2.16b,v9.16b 1095 eor v3.16b,v3.16b,v10.16b 1096 // save back IV 1097 st1 {v15.4s}, [ivp] 1098 eor v4.16b,v4.16b,v11.16b 1099 eor v5.16b,v5.16b,v12.16b 1100 eor v6.16b,v6.16b,v13.16b 1101 eor v7.16b,v7.16b,v14.16b 1102 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64 1103 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64 1104 subs blocks,blocks,#8 1105 b.gt .Lcbc_8_blocks_dec 1106 b.eq 100f 11071: 1108 ld1 {ivec1.4s},[ivp] 1109.Lcbc_4_blocks_dec: 1110 cmp blocks,#4 1111 b.lt 1f 1112 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[inp] 1113#ifndef HITLS_BIG_ENDIAN 1114 rev32 v4.16b,v4.16b 1115 rev32 v5.16b,v5.16b 1116 rev32 v6.16b,v6.16b 1117 rev32 v7.16b,v7.16b 1118#endif 1119 bl Sm4Enc4blks 
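    // the ciphertext is reloaded below in block order so each decrypted
    // block can be XORed with the ciphertext block that precedes it (ivec1
    // for block 0); the last ciphertext block is both copied into ivec1,
    // which aliases v15, for the next loop iteration and stored back to
    // [ivp] as the updated IV.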
1120 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[inp],#64 1121 transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d 1122 eor v0.16b,v0.16b,ivec1.16b 1123 eor v1.16b,v1.16b,v4.16b 1124 orr v15.16b,v7.16b,v7.16b 1125 eor v2.16b,v2.16b,v5.16b 1126 eor v3.16b,v3.16b,v6.16b 1127 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64 1128 // save back IV 1129 st1 {v7.4s}, [ivp] 1130 subs blocks,blocks,#4 1131 b.gt .Lcbc_4_blocks_dec 1132 b 100f 11331: // last block 1134 subs blocks,blocks,#1 1135 b.lt 100f 1136 b.gt 1f 1137 // load iv 1138 ldp w6,w7,[ivp] 1139 ldp w16,w17,[ivp,#8] 1140 1141 ldp w8,w9,[inp] 1142 ldp w10,w11,[inp,#8] 1143 // store back iv 1144 stp w8,w9,[ivp] 1145 stp w10,w11,[ivp,#8] 1146#ifndef HITLS_BIG_ENDIAN 1147 rev w8,w8 1148 rev w9,w9 1149 rev w10,w10 1150 rev w11,w11 1151#endif 1152 EncRound 1153#ifndef HITLS_BIG_ENDIAN 1154 rev w8,w8 1155 rev w9,w9 1156 rev w10,w10 1157 rev w11,w11 1158#endif 1159 eor w11,w11,w6 1160 eor w10,w10,w7 1161 eor w9,w9,w16 1162 eor w8,w8,w17 1163 stp w11,w10,[outp],#8 1164 stp w9,w8,[outp],#8 1165 b 100f 11661: // last two blocks 1167 ld4 {v4.s,v5.s,v6.s,v7.s}[0], [inp] 1168 add ptr,inp,#16 1169 ld4 {v4.s,v5.s,v6.s,v7.s}[1],[ptr],#16 1170 subs blocks,blocks,1 1171 b.gt 1f 1172#ifndef HITLS_BIG_ENDIAN 1173 rev32 v4.16b,v4.16b 1174 rev32 v5.16b,v5.16b 1175 rev32 v6.16b,v6.16b 1176 rev32 v7.16b,v7.16b 1177#endif 1178 bl Sm4Enc4blks 1179 ld1 {v4.4s,v5.4s},[inp],#32 1180 transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d 1181 eor v0.16b,v0.16b,ivec1.16b 1182 eor v1.16b,v1.16b,v4.16b 1183 st1 {v0.4s,v1.4s},[outp],#32 1184 // save back IV 1185 st1 {v5.4s}, [ivp] 1186 b 100f 11871: // last 3 blocks 1188 ld4 {v4.s,v5.s,v6.s,v7.s}[2],[ptr] 1189#ifndef HITLS_BIG_ENDIAN 1190 rev32 v4.16b,v4.16b 1191 rev32 v5.16b,v5.16b 1192 rev32 v6.16b,v6.16b 1193 rev32 v7.16b,v7.16b 1194#endif 1195 bl Sm4Enc4blks 1196 ld1 {v4.4s,v5.4s,v6.4s},[inp],#48 1197 transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d 1198 eor v0.16b,v0.16b,ivec1.16b 1199 eor v1.16b,v1.16b,v4.16b 1200 eor v2.16b,v2.16b,v5.16b 1201 st1 {v0.4s,v1.4s,v2.4s},[outp],#48 1202 // save back IV 1203 st1 {v6.4s}, [ivp] 1204100: 1205 ldp d10,d11,[sp,#16] 1206 ldp d12,d13,[sp,#32] 1207 ldp d14,d15,[sp,#48] 1208 ldp d8,d9,[sp],#64 1209 ldp x19,x20,[sp,#16] 1210 ldp x21,x22,[sp,#32] 1211 ldp x29,x30,[sp],#48 1212AARCH64_AUTIASP 1213 ret 1214.size Vpsm4CbcEncrypt,.-Vpsm4CbcEncrypt 1215 1216 1217# void Vpsm4Ctr32EncryptBlocks(const uint8_t *in, uint8_t *out, uint64_t blocks, const uint32_t *key, uint8_t *iv); 1218.globl Vpsm4Ctr32EncryptBlocks 1219.type Vpsm4Ctr32EncryptBlocks,%function 1220.align 5 1221Vpsm4Ctr32EncryptBlocks: 1222AARCH64_PACIASP 1223 ld1 {ivec.4s},[ivp] 1224#ifndef HITLS_BIG_ENDIAN 1225 rev32 v3.16b,v3.16b 1226#endif 1227 LoadSbox 1228 cmp blocks,#1 1229 b.ne 1f 1230 // fast processing for one single block without 1231 // context saving overhead 1232 stp x19,x20,[sp,#-32]! 
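    // single-block fast path: only x19-x22 are spilled for the scalar
    // T-box pointers, the counter block in ivec is encrypted via the
    // table-based EncRound path, XORed with the one input block, and the
    // 32-bit big-endian counter in iv[12..15] is incremented by one.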
1233 stp x21,x22,[sp,#16] 1234 adrp x19, .Ltbox1 1235 add x19,x19,:lo12:.Ltbox1 1236 adrp x20, .Ltbox2 1237 add x20,x20,:lo12:.Ltbox2 1238 adrp x21, .Ltbox3 1239 add x21,x21,:lo12:.Ltbox3 1240 adrp x22, .Ltbox4 1241 add x22,x22,:lo12:.Ltbox4 1242 1243 Encrypt1blkNorevCtr 1244 1245 ld1 {v4.4s},[inp] 1246 eor v4.16b,v4.16b,ivec.16b 1247 st1 {v4.4s},[outp] 1248 ldp x21,x22,[sp,#16] 1249 ldp x19,x20,[sp],#32 1250 ldr ctr,[ivp,#12] 1251#ifndef HITLS_BIG_ENDIAN 1252 rev ctr,ctr 1253#endif 1254 add ctr,ctr,#1 1255#ifndef HITLS_BIG_ENDIAN 1256 rev ctr,ctr 1257#endif 1258 str ctr,[ivp,#12] 1259AARCH64_AUTIASP 1260 ret 12611: 1262 stp d8,d9,[sp,#-112]! 1263 stp d10,d11,[sp,#16] 1264 stp d12,d13,[sp,#32] 1265 stp d14,d15,[sp,#48] 1266 stp x29,x30,[sp,#64] 1267 stp x19,x20,[sp,#80] 1268 stp x21,x22,[sp,#96] 1269 mov word0, ivec.s[0] 1270 mov word1, ivec.s[1] 1271 mov word2, ivec.s[2] 1272 mov ctr, ivec.s[3] 1273.Lctr32_4_blocks_process: 1274 cmp blocks,#4 1275 b.lt 1f 1276 dup v4.4s,word0 1277 dup v5.4s,word1 1278 dup v6.4s,word2 1279 mov v7.s[0],w5 1280 add ctr,ctr,#1 1281 mov v7.s[1],ctr 1282 add ctr,ctr,#1 1283 mov v7.s[2],ctr 1284 add ctr,ctr,#1 1285 mov v7.s[3],ctr 1286 add ctr,ctr,#1 1287 cmp blocks,#8 1288 b.ge .Lctr32_8_blocks_process 1289 bl Sm4Enc4blks 1290 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64 1291 eor v0.16b,v0.16b,v12.16b 1292 eor v1.16b,v1.16b,v13.16b 1293 eor v2.16b,v2.16b,v14.16b 1294 eor v3.16b,v3.16b,v15.16b 1295 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64 1296 subs blocks,blocks,#4 1297 b.ne .Lctr32_4_blocks_process 1298 b 100f 1299.Lctr32_8_blocks_process: 1300 dup v8.4s,word0 1301 dup v9.4s,word1 1302 dup v10.4s,word2 1303 mov v11.s[0],ctr 1304 add ctr,ctr,#1 1305 mov v11.s[1],ctr 1306 add ctr,ctr,#1 1307 mov v11.s[2],ctr 1308 add ctr,ctr,#1 1309 mov v11.s[3],ctr 1310 add ctr,ctr,#1 1311 cmp blocks,#12 1312 b.ge .Lctr32_12_blocks_process 1313 bl Sm4Enc8blks 1314 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64 1315 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[inp],#64 1316 eor v0.16b,v0.16b,v12.16b 1317 eor v1.16b,v1.16b,v13.16b 1318 eor v2.16b,v2.16b,v14.16b 1319 eor v3.16b,v3.16b,v15.16b 1320 eor v4.16b,v4.16b,v8.16b 1321 eor v5.16b,v5.16b,v9.16b 1322 eor v6.16b,v6.16b,v10.16b 1323 eor v7.16b,v7.16b,v11.16b 1324 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64 1325 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64 1326 subs blocks,blocks,#8 1327 b.ne .Lctr32_4_blocks_process 1328 b 100f 1329.Lctr32_12_blocks_process: 1330 dup v16.4s,word0 1331 dup v17.4s,word1 1332 dup v18.4s,word2 1333 mov v19.s[0],ctr 1334 add ctr,ctr,#1 1335 mov v19.s[1],ctr 1336 add ctr,ctr,#1 1337 mov v19.s[2],ctr 1338 add ctr,ctr,#1 1339 mov v19.s[3],ctr 1340 add ctr,ctr,#1 1341 bl Sm4Enc12blks 1342 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64 1343 eor v0.16b,v0.16b,v12.16b 1344 eor v1.16b,v1.16b,v13.16b 1345 eor v2.16b,v2.16b,v14.16b 1346 eor v3.16b,v3.16b,v15.16b 1347 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[outp],#64 1348 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64 1349 eor v4.16b,v4.16b,v12.16b 1350 eor v5.16b,v5.16b,v13.16b 1351 eor v6.16b,v6.16b,v14.16b 1352 eor v7.16b,v7.16b,v15.16b 1353 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64 1354 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64 1355 eor v8.16b,v8.16b,v12.16b 1356 eor v9.16b,v9.16b,v13.16b 1357 eor v10.16b,v10.16b,v14.16b 1358 eor v11.16b,v11.16b,v15.16b 1359 st4 {v8.4s,v9.4s,v10.4s,v11.4s},[outp],#64 1360 subs blocks,blocks,#12 1361 b.ne .Lctr32_4_blocks_process 1362 b 100f 1363 13641: // last block processing 1365 subs blocks,blocks,#1 1366 b.lt 100f 1367 b.gt 1f 1368 mov ivec.s[0],word0 
1369 mov ivec.s[1],word1 1370 mov ivec.s[2],word2 1371 mov ivec.s[3],ctr 1372 add ctr,ctr,#1 1373 1374 adrp x19, .Ltbox1 1375 add x19,x19,:lo12:.Ltbox1 1376 adrp x20, .Ltbox2 1377 add x20,x20,:lo12:.Ltbox2 1378 adrp x21, .Ltbox3 1379 add x21,x21,:lo12:.Ltbox3 1380 adrp x22, .Ltbox4 1381 add x22,x22,:lo12:.Ltbox4 1382 1383 Encrypt1blkNorevCtr 1384 1385 ld1 {v4.4s},[inp] 1386 eor v4.16b,v4.16b,ivec.16b 1387 st1 {v4.4s},[outp] 1388 b 100f 1389 13901: // last 2 blocks processing 1391 1392 dup v4.4s,word0 1393 dup v5.4s,word1 1394 dup v6.4s,word2 1395 mov v7.s[0],ctr 1396 add ctr,ctr,#1 1397 mov v7.s[1],ctr 1398 subs blocks,blocks,#1 1399 b.ne 1f 1400 add ctr,ctr,#1 1401 bl Sm4Enc4blks 1402 ld4 {v12.s,v13.s,v14.s,v15.s}[0],[inp],#16 1403 ld4 {v12.s,v13.s,v14.s,v15.s}[1],[inp],#16 1404 eor v0.16b,v0.16b,v12.16b 1405 eor v1.16b,v1.16b,v13.16b 1406 eor v2.16b,v2.16b,v14.16b 1407 eor v3.16b,v3.16b,v15.16b 1408 st4 {v0.s,v1.s,v2.s,v3.s}[0],[outp],#16 1409 st4 {v0.s,v1.s,v2.s,v3.s}[1],[outp],#16 1410 b 100f 1411 14121: // last 3 blocks processing 1413 add ctr,ctr,#1 1414 mov v7.s[2],ctr 1415 add ctr,ctr,#1 1416 bl Sm4Enc4blks 1417 ld4 {v12.s,v13.s,v14.s,v15.s}[0],[inp],#16 1418 ld4 {v12.s,v13.s,v14.s,v15.s}[1],[inp],#16 1419 ld4 {v12.s,v13.s,v14.s,v15.s}[2],[inp],#16 1420 eor v0.16b,v0.16b,v12.16b 1421 eor v1.16b,v1.16b,v13.16b 1422 eor v2.16b,v2.16b,v14.16b 1423 eor v3.16b,v3.16b,v15.16b 1424 st4 {v0.s,v1.s,v2.s,v3.s}[0],[outp],#16 1425 st4 {v0.s,v1.s,v2.s,v3.s}[1],[outp],#16 1426 st4 {v0.s,v1.s,v2.s,v3.s}[2],[outp],#16 1427100: 1428 ldp d10,d11,[sp,#16] 1429 ldp d12,d13,[sp,#32] 1430 ldp d14,d15,[sp,#48] 1431 ldp x29,x30,[sp,#64] 1432 ldp x19,x20,[sp,#80] 1433 ldp x21,x22,[sp,#96] 1434 ldp d8,d9,[sp],#112 1435#ifndef HITLS_BIG_ENDIAN 1436 rev ctr, ctr 1437#endif 1438 str ctr, [ivp,#12] 1439AARCH64_AUTIASP 1440 ret 1441.size Vpsm4Ctr32EncryptBlocks,.-Vpsm4Ctr32EncryptBlocks 1442 1443.globl Vpsm4XtsCipher 1444.type Vpsm4XtsCipher,%function 1445.align 5 1446Vpsm4XtsCipher: 1447AARCH64_PACIASP 1448 stp x19, x20, [sp, #-0x10]! 1449 stp x21, x22, [sp, #-0x10]! 1450 stp x23, x24, [sp, #-0x10]! 1451 stp x25, x26, [sp, #-0x10]! 1452 stp x27, x28, [sp, #-0x10]! 1453 stp x29, x30, [sp, #-0x10]! 1454 stp d8, d9, [sp, #-0x10]! 1455 stp d10, d11, [sp, #-0x10]! 1456 stp d12, d13, [sp, #-0x10]! 1457 stp d14, d15, [sp, #-0x10]! 
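    // XTS setup: the key and mode arguments (x3, x4, w6) are parked in
    // callee-saved registers, the initial tweak is loaded from [x5], and
    // 192 bytes of stack are reserved to stash up to 12 tweaks per batch.
    // Each subsequent tweak is the previous one multiplied by x in
    // GF(2^128) with the reducing polynomial 0x87 (the repeated
    // mov w7,0x87 / extr / eor ... lsl#1 sequences below, operating on the
    // bit-reversed tweak).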
1458 sub sp, sp, #192 1459 mov x24, sp 1460 mov x26,x3 1461 mov x27,x4 1462 mov w28,w6 1463 ld1 {v16.4s}, [x5] 1464 LoadSbox 1465 1466 and x29,x2,#0x0F 1467 // convert length into blocks 1468 lsr x2,x2,4 1469 cmp x2,#1 1470 b.lt .Lxts_cipher_return 1471 1472 cmp x29,0 1473 // If the encryption/decryption Length is N times of 16, 1474 // the all blocks are encrypted/decrypted in .xts_encrypt_blocks 1475 b.eq .xts_encrypt_blocks 1476 1477 // If the encryption/decryption length is not N times of 16, 1478 // the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak 1479 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks 1480 subs x2,x2,#1 1481 b.eq .only_2blks_tweak 1482.xts_encrypt_blocks: 1483 rbit v16.16b,v16.16b 1484#ifdef HITLS_BIG_ENDIAN 1485 rev32 v16.16b,v16.16b 1486#endif 1487 mov x12,v16.d[0] 1488 mov x13,v16.d[1] 1489 mov w7,0x87 1490 extr x9,x13,x13,#32 1491 extr x15,x13,x12,#63 1492 and w8,w7,w9,asr#31 1493 eor x14,x8,x12,lsl#1 1494 mov w7,0x87 1495 extr x9,x15,x15,#32 1496 extr x17,x15,x14,#63 1497 and w8,w7,w9,asr#31 1498 eor x16,x8,x14,lsl#1 1499 mov w7,0x87 1500 extr x9,x17,x17,#32 1501 extr x19,x17,x16,#63 1502 and w8,w7,w9,asr#31 1503 eor x18,x8,x16,lsl#1 1504.Lxts_12_blocks_process: 1505 mov x24, sp 1506 cmp x2,#12 1507 b.lt .Lxts_8_blocks_process 1508 mov v16.d[0],x12 1509 mov v16.d[1],x13 1510#ifdef HITLS_BIG_ENDIAN 1511 rev32 v16.16b,v16.16b 1512#endif 1513 mov w7,0x87 1514 extr x9,x19,x19,#32 1515 extr x13,x19,x18,#63 1516 and w8,w7,w9,asr#31 1517 eor x12,x8,x18,lsl#1 1518 mov v17.d[0],x14 1519 mov v17.d[1],x15 1520#ifdef HITLS_BIG_ENDIAN 1521 rev32 v17.16b,v17.16b 1522#endif 1523 mov w7,0x87 1524 extr x9,x13,x13,#32 1525 extr x15,x13,x12,#63 1526 and w8,w7,w9,asr#31 1527 eor x14,x8,x12,lsl#1 1528 mov v18.d[0],x16 1529 mov v18.d[1],x17 1530#ifdef HITLS_BIG_ENDIAN 1531 rev32 v18.16b,v18.16b 1532#endif 1533 mov w7,0x87 1534 extr x9,x15,x15,#32 1535 extr x17,x15,x14,#63 1536 and w8,w7,w9,asr#31 1537 eor x16,x8,x14,lsl#1 1538 mov v19.d[0],x18 1539 mov v19.d[1],x19 1540#ifdef HITLS_BIG_ENDIAN 1541 rev32 v19.16b,v19.16b 1542#endif 1543 mov w7,0x87 1544 extr x9,x17,x17,#32 1545 extr x19,x17,x16,#63 1546 and w8,w7,w9,asr#31 1547 eor x18,x8,x16,lsl#1 1548 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 1549 rbit v16.16b,v16.16b 1550 rbit v17.16b,v17.16b 1551 rbit v18.16b,v18.16b 1552 rbit v19.16b,v19.16b 1553 eor v4.16b, v4.16b, v16.16b 1554 eor v5.16b, v5.16b, v17.16b 1555 eor v6.16b, v6.16b, v18.16b 1556 eor v7.16b, v7.16b, v19.16b 1557 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 1558 st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64 1559 mov v16.d[0],x12 1560 mov v16.d[1],x13 1561#ifdef HITLS_BIG_ENDIAN 1562 rev32 v16.16b,v16.16b 1563#endif 1564 mov w7,0x87 1565 extr x9,x19,x19,#32 1566 extr x13,x19,x18,#63 1567 and w8,w7,w9,asr#31 1568 eor x12,x8,x18,lsl#1 1569 mov v17.d[0],x14 1570 mov v17.d[1],x15 1571#ifdef HITLS_BIG_ENDIAN 1572 rev32 v17.16b,v17.16b 1573#endif 1574 mov w7,0x87 1575 extr x9,x13,x13,#32 1576 extr x15,x13,x12,#63 1577 and w8,w7,w9,asr#31 1578 eor x14,x8,x12,lsl#1 1579 mov v18.d[0],x16 1580 mov v18.d[1],x17 1581#ifdef HITLS_BIG_ENDIAN 1582 rev32 v18.16b,v18.16b 1583#endif 1584 mov w7,0x87 1585 extr x9,x15,x15,#32 1586 extr x17,x15,x14,#63 1587 and w8,w7,w9,asr#31 1588 eor x16,x8,x14,lsl#1 1589 mov v19.d[0],x18 1590 mov v19.d[1],x19 1591#ifdef HITLS_BIG_ENDIAN 1592 rev32 v19.16b,v19.16b 1593#endif 1594 mov w7,0x87 1595 extr x9,x17,x17,#32 1596 extr x19,x17,x16,#63 1597 and w8,w7,w9,asr#31 1598 eor x18,x8,x16,lsl#1 1599 rbit 
v16.16b,v16.16b 1600 rbit v17.16b,v17.16b 1601 rbit v18.16b,v18.16b 1602 rbit v19.16b,v19.16b 1603 eor v8.16b, v8.16b, v16.16b 1604 eor v9.16b, v9.16b, v17.16b 1605 eor v10.16b, v10.16b, v18.16b 1606 eor v11.16b, v11.16b, v19.16b 1607 ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 1608 st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64 1609 mov v16.d[0],x12 1610 mov v16.d[1],x13 1611#ifdef HITLS_BIG_ENDIAN 1612 rev32 v16.16b,v16.16b 1613#endif 1614 mov w7,0x87 1615 extr x9,x19,x19,#32 1616 extr x13,x19,x18,#63 1617 and w8,w7,w9,asr#31 1618 eor x12,x8,x18,lsl#1 1619 mov v17.d[0],x14 1620 mov v17.d[1],x15 1621#ifdef HITLS_BIG_ENDIAN 1622 rev32 v17.16b,v17.16b 1623#endif 1624 mov w7,0x87 1625 extr x9,x13,x13,#32 1626 extr x15,x13,x12,#63 1627 and w8,w7,w9,asr#31 1628 eor x14,x8,x12,lsl#1 1629 mov v18.d[0],x16 1630 mov v18.d[1],x17 1631#ifdef HITLS_BIG_ENDIAN 1632 rev32 v18.16b,v18.16b 1633#endif 1634 mov w7,0x87 1635 extr x9,x15,x15,#32 1636 extr x17,x15,x14,#63 1637 and w8,w7,w9,asr#31 1638 eor x16,x8,x14,lsl#1 1639 mov v19.d[0],x18 1640 mov v19.d[1],x19 1641#ifdef HITLS_BIG_ENDIAN 1642 rev32 v19.16b,v19.16b 1643#endif 1644 mov w7,0x87 1645 extr x9,x17,x17,#32 1646 extr x19,x17,x16,#63 1647 and w8,w7,w9,asr#31 1648 eor x18,x8,x16,lsl#1 1649 rbit v16.16b,v16.16b 1650 rbit v17.16b,v17.16b 1651 rbit v18.16b,v18.16b 1652 rbit v19.16b,v19.16b 1653 eor v0.16b, v0.16b, v16.16b 1654 eor v1.16b, v1.16b, v17.16b 1655 eor v2.16b, v2.16b, v18.16b 1656 eor v3.16b, v3.16b, v19.16b 1657 st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64 1658 mov v16.16b,v0.16b 1659 mov v17.16b,v1.16b 1660 mov v18.16b,v2.16b 1661 mov v19.16b,v3.16b 1662#ifndef HITLS_BIG_ENDIAN 1663 rev32 v4.16b,v4.16b 1664 rev32 v5.16b,v5.16b 1665 rev32 v6.16b,v6.16b 1666 rev32 v7.16b,v7.16b 1667 rev32 v8.16b,v8.16b 1668 rev32 v9.16b,v9.16b 1669 rev32 v10.16b,v10.16b 1670 rev32 v11.16b,v11.16b 1671 rev32 v16.16b,v16.16b 1672 rev32 v17.16b,v17.16b 1673 rev32 v18.16b,v18.16b 1674 rev32 v19.16b,v19.16b 1675#endif 1676 zip1 v0.4s,v4.4s,v5.4s 1677 zip2 v1.4s,v4.4s,v5.4s 1678 zip1 v2.4s,v6.4s,v7.4s 1679 zip2 v3.4s,v6.4s,v7.4s 1680 zip1 v4.2d,v0.2d,v2.2d 1681 zip2 v5.2d,v0.2d,v2.2d 1682 zip1 v6.2d,v1.2d,v3.2d 1683 zip2 v7.2d,v1.2d,v3.2d 1684 zip1 v0.4s,v8.4s,v9.4s 1685 zip2 v1.4s,v8.4s,v9.4s 1686 zip1 v2.4s,v10.4s,v11.4s 1687 zip2 v3.4s,v10.4s,v11.4s 1688 zip1 v8.2d,v0.2d,v2.2d 1689 zip2 v9.2d,v0.2d,v2.2d 1690 zip1 v10.2d,v1.2d,v3.2d 1691 zip2 v11.2d,v1.2d,v3.2d 1692 zip1 v0.4s,v16.4s,v17.4s 1693 zip2 v1.4s,v16.4s,v17.4s 1694 zip1 v2.4s,v18.4s,v19.4s 1695 zip2 v3.4s,v18.4s,v19.4s 1696 zip1 v16.2d,v0.2d,v2.2d 1697 zip2 v17.2d,v0.2d,v2.2d 1698 zip1 v18.2d,v1.2d,v3.2d 1699 zip2 v19.2d,v1.2d,v3.2d 1700 bl Sm4Enc12blks 1701 zip1 v16.4s,v0.4s,v1.4s 1702 zip2 v17.4s,v0.4s,v1.4s 1703 zip1 v18.4s,v2.4s,v3.4s 1704 zip2 v19.4s,v2.4s,v3.4s 1705 zip1 v0.2d,v16.2d,v18.2d 1706 zip2 v1.2d,v16.2d,v18.2d 1707 zip1 v2.2d,v17.2d,v19.2d 1708 zip2 v3.2d,v17.2d,v19.2d 1709 zip1 v16.4s,v4.4s,v5.4s 1710 zip2 v17.4s,v4.4s,v5.4s 1711 zip1 v18.4s,v6.4s,v7.4s 1712 zip2 v19.4s,v6.4s,v7.4s 1713 zip1 v4.2d,v16.2d,v18.2d 1714 zip2 v5.2d,v16.2d,v18.2d 1715 zip1 v6.2d,v17.2d,v19.2d 1716 zip2 v7.2d,v17.2d,v19.2d 1717 zip1 v16.4s,v8.4s,v9.4s 1718 zip2 v17.4s,v8.4s,v9.4s 1719 zip1 v18.4s,v10.4s,v11.4s 1720 zip2 v19.4s,v10.4s,v11.4s 1721 zip1 v8.2d,v16.2d,v18.2d 1722 zip2 v9.2d,v16.2d,v18.2d 1723 zip1 v10.2d,v17.2d,v19.2d 1724 zip2 v11.2d,v17.2d,v19.2d 1725 mov x24, sp 1726 ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64 1727 eor v0.16b, v0.16b, v16.16b 1728 eor v1.16b, v1.16b, v17.16b 1729 eor v2.16b, v2.16b, 
v18.16b 1730 eor v3.16b, v3.16b, v19.16b 1731 1732 ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64 1733 eor v4.16b, v4.16b, v16.16b 1734 eor v5.16b, v5.16b, v17.16b 1735 eor v6.16b, v6.16b, v18.16b 1736 eor v7.16b, v7.16b, v19.16b 1737 1738 ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64 1739 eor v8.16b, v8.16b, v16.16b 1740 eor v9.16b, v9.16b, v17.16b 1741 eor v10.16b, v10.16b, v18.16b 1742 eor v11.16b, v11.16b, v19.16b 1743 1744 // save the last tweak 1745 mov v24.16b,v19.16b 1746 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 1747 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 1748 st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x1],#64 1749 subs x2,x2,#12 1750 b.gt .Lxts_12_blocks_process 1751 b 100f 1752.Lxts_8_blocks_process: 1753 mov x24, sp 1754 cmp x2,#8 1755 mov v16.d[0],x12 1756 mov v16.d[1],x13 1757#ifdef HITLS_BIG_ENDIAN 1758 rev32 v16.16b,v16.16b 1759#endif 1760 mov w7,0x87 1761 extr x9,x19,x19,#32 1762 extr x13,x19,x18,#63 1763 and w8,w7,w9,asr#31 1764 eor x12,x8,x18,lsl#1 1765 mov v17.d[0],x14 1766 mov v17.d[1],x15 1767#ifdef HITLS_BIG_ENDIAN 1768 rev32 v17.16b,v17.16b 1769#endif 1770 mov w7,0x87 1771 extr x9,x13,x13,#32 1772 extr x15,x13,x12,#63 1773 and w8,w7,w9,asr#31 1774 eor x14,x8,x12,lsl#1 1775 mov v18.d[0],x16 1776 mov v18.d[1],x17 1777#ifdef HITLS_BIG_ENDIAN 1778 rev32 v18.16b,v18.16b 1779#endif 1780 mov w7,0x87 1781 extr x9,x15,x15,#32 1782 extr x17,x15,x14,#63 1783 and w8,w7,w9,asr#31 1784 eor x16,x8,x14,lsl#1 1785 mov v19.d[0],x18 1786 mov v19.d[1],x19 1787#ifdef HITLS_BIG_ENDIAN 1788 rev32 v19.16b,v19.16b 1789#endif 1790 mov w7,0x87 1791 extr x9,x17,x17,#32 1792 extr x19,x17,x16,#63 1793 and w8,w7,w9,asr#31 1794 eor x18,x8,x16,lsl#1 1795 b.lt .Lxts_4_blocks_process 1796 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 1797 rbit v16.16b,v16.16b 1798 rbit v17.16b,v17.16b 1799 rbit v18.16b,v18.16b 1800 rbit v19.16b,v19.16b 1801 eor v4.16b, v4.16b, v16.16b 1802 eor v5.16b, v5.16b, v17.16b 1803 eor v6.16b, v6.16b, v18.16b 1804 eor v7.16b, v7.16b, v19.16b 1805 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 1806 st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24], #64 1807 mov v16.d[0],x12 1808 mov v16.d[1],x13 1809#ifdef HITLS_BIG_ENDIAN 1810 rev32 v16.16b,v16.16b 1811#endif 1812 mov w7,0x87 1813 extr x9,x19,x19,#32 1814 extr x13,x19,x18,#63 1815 and w8,w7,w9,asr#31 1816 eor x12,x8,x18,lsl#1 1817 mov v17.d[0],x14 1818 mov v17.d[1],x15 1819#ifdef HITLS_BIG_ENDIAN 1820 rev32 v17.16b,v17.16b 1821#endif 1822 mov w7,0x87 1823 extr x9,x13,x13,#32 1824 extr x15,x13,x12,#63 1825 and w8,w7,w9,asr#31 1826 eor x14,x8,x12,lsl#1 1827 mov v18.d[0],x16 1828 mov v18.d[1],x17 1829#ifdef HITLS_BIG_ENDIAN 1830 rev32 v18.16b,v18.16b 1831#endif 1832 mov w7,0x87 1833 extr x9,x15,x15,#32 1834 extr x17,x15,x14,#63 1835 and w8,w7,w9,asr#31 1836 eor x16,x8,x14,lsl#1 1837 mov v19.d[0],x18 1838 mov v19.d[1],x19 1839#ifdef HITLS_BIG_ENDIAN 1840 rev32 v19.16b,v19.16b 1841#endif 1842 mov w7,0x87 1843 extr x9,x17,x17,#32 1844 extr x19,x17,x16,#63 1845 and w8,w7,w9,asr#31 1846 eor x18,x8,x16,lsl#1 1847 rbit v16.16b,v16.16b 1848 rbit v17.16b,v17.16b 1849 rbit v18.16b,v18.16b 1850 rbit v19.16b,v19.16b 1851 eor v8.16b, v8.16b, v16.16b 1852 eor v9.16b, v9.16b, v17.16b 1853 eor v10.16b, v10.16b, v18.16b 1854 eor v11.16b, v11.16b, v19.16b 1855 st1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64 1856#ifndef HITLS_BIG_ENDIAN 1857 rev32 v4.16b,v4.16b 1858 rev32 v5.16b,v5.16b 1859 rev32 v6.16b,v6.16b 1860 rev32 v7.16b,v7.16b 1861 rev32 v8.16b,v8.16b 1862 rev32 v9.16b,v9.16b 1863 rev32 v10.16b,v10.16b 1864 rev32 v11.16b,v11.16b 1865#endif 1866 zip1 v0.4s,v4.4s,v5.4s 1867 zip2 
    zip1 v0.4s,v4.4s,v5.4s
    zip2 v1.4s,v4.4s,v5.4s
    zip1 v2.4s,v6.4s,v7.4s
    zip2 v3.4s,v6.4s,v7.4s
    zip1 v4.2d,v0.2d,v2.2d
    zip2 v5.2d,v0.2d,v2.2d
    zip1 v6.2d,v1.2d,v3.2d
    zip2 v7.2d,v1.2d,v3.2d
    zip1 v0.4s,v8.4s,v9.4s
    zip2 v1.4s,v8.4s,v9.4s
    zip1 v2.4s,v10.4s,v11.4s
    zip2 v3.4s,v10.4s,v11.4s
    zip1 v8.2d,v0.2d,v2.2d
    zip2 v9.2d,v0.2d,v2.2d
    zip1 v10.2d,v1.2d,v3.2d
    zip2 v11.2d,v1.2d,v3.2d
    bl Sm4Enc8blks
    zip1 v8.4s,v0.4s,v1.4s
    zip2 v9.4s,v0.4s,v1.4s
    zip1 v10.4s,v2.4s,v3.4s
    zip2 v11.4s,v2.4s,v3.4s
    zip1 v0.2d,v8.2d,v10.2d
    zip2 v1.2d,v8.2d,v10.2d
    zip1 v2.2d,v9.2d,v11.2d
    zip2 v3.2d,v9.2d,v11.2d
    zip1 v8.4s,v4.4s,v5.4s
    zip2 v9.4s,v4.4s,v5.4s
    zip1 v10.4s,v6.4s,v7.4s
    zip2 v11.4s,v6.4s,v7.4s
    zip1 v4.2d,v8.2d,v10.2d
    zip2 v5.2d,v8.2d,v10.2d
    zip1 v6.2d,v9.2d,v11.2d
    zip2 v7.2d,v9.2d,v11.2d
    mov x24, sp
    ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
    eor v0.16b, v0.16b, v16.16b
    eor v1.16b, v1.16b, v17.16b
    eor v2.16b, v2.16b, v18.16b
    eor v3.16b, v3.16b, v19.16b

    ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x24],#64
    eor v4.16b, v4.16b, v16.16b
    eor v5.16b, v5.16b, v17.16b
    eor v6.16b, v6.16b, v18.16b
    eor v7.16b, v7.16b, v19.16b

    // save the last tweak
    mov v24.16b,v19.16b
    st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
    st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
    subs x2,x2,#8
    b.gt .Lxts_8_blocks_process
    b 100f
.Lxts_4_blocks_process:
    cmp x2,#4
    b.lt 1f
    ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
    rbit v16.16b,v16.16b
    rbit v17.16b,v17.16b
    rbit v18.16b,v18.16b
    rbit v19.16b,v19.16b
    eor v4.16b, v4.16b, v16.16b
    eor v5.16b, v5.16b, v17.16b
    eor v6.16b, v6.16b, v18.16b
    eor v7.16b, v7.16b, v19.16b
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
    rev32 v5.16b,v5.16b
    rev32 v6.16b,v6.16b
    rev32 v7.16b,v7.16b
#endif
    zip1 v0.4s,v4.4s,v5.4s
    zip2 v1.4s,v4.4s,v5.4s
    zip1 v2.4s,v6.4s,v7.4s
    zip2 v3.4s,v6.4s,v7.4s
    zip1 v4.2d,v0.2d,v2.2d
    zip2 v5.2d,v0.2d,v2.2d
    zip1 v6.2d,v1.2d,v3.2d
    zip2 v7.2d,v1.2d,v3.2d
    bl Sm4Enc4blks
    zip1 v4.4s,v0.4s,v1.4s
    zip2 v5.4s,v0.4s,v1.4s
    zip1 v6.4s,v2.4s,v3.4s
    zip2 v7.4s,v2.4s,v3.4s
    zip1 v0.2d,v4.2d,v6.2d
    zip2 v1.2d,v4.2d,v6.2d
    zip1 v2.2d,v5.2d,v7.2d
    zip2 v3.2d,v5.2d,v7.2d
    eor v0.16b, v0.16b, v16.16b
    eor v1.16b, v1.16b, v17.16b
    eor v2.16b, v2.16b, v18.16b
    eor v3.16b, v3.16b, v19.16b
    st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
    sub x2,x2,#4
    // save the last tweak
    mov v24.16b,v19.16b
    mov v16.d[0],x12
    mov v16.d[1],x13
#ifdef HITLS_BIG_ENDIAN
    rev32 v16.16b,v16.16b
#endif
    mov w7,0x87
    extr x9,x19,x19,#32
    extr x13,x19,x18,#63
    and w8,w7,w9,asr#31
    eor x12,x8,x18,lsl#1
    mov v17.d[0],x14
    mov v17.d[1],x15
#ifdef HITLS_BIG_ENDIAN
    rev32 v17.16b,v17.16b
#endif
    mov w7,0x87
    extr x9,x13,x13,#32
    extr x15,x13,x12,#63
    and w8,w7,w9,asr#31
    eor x14,x8,x12,lsl#1
    mov v18.d[0],x16
    mov v18.d[1],x17
#ifdef HITLS_BIG_ENDIAN
    rev32 v18.16b,v18.16b
#endif
    mov w7,0x87
    extr x9,x15,x15,#32
    extr x17,x15,x14,#63
    and w8,w7,w9,asr#31
    eor x16,x8,x14,lsl#1
    mov v19.d[0],x18
    mov v19.d[1],x19
#ifdef HITLS_BIG_ENDIAN
    rev32 v19.16b,v19.16b
#endif
    mov w7,0x87
    extr x9,x17,x17,#32
    extr x19,x17,x16,#63
    and w8,w7,w9,asr#31
    eor x18,x8,x16,lsl#1
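    // Note: a single remaining block is encrypted below with a scalar SM4
    // (two- and three-block tails branch to the Sm4Enc4blks paths further down):
    // 8 iterations of 4 rounds each, 32 rounds in total. Each round XORs three
    // state words with a round key, passes the result through the S-box (the
    // tbl/aese sequence below) and applies the SM4 linear transform
    // L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24).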
1:
    // process last block
    cmp x2,#1
    b.lt 100f
    b.gt 1f
    ld1 {v4.4s},[x0],#16
    rbit v16.16b,v16.16b
    eor v4.16b, v4.16b, v16.16b
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
#endif
    mov x10,x3
    mov w11,#8
    mov w12,v4.s[0]
    mov w13,v4.s[1]
    mov w14,v4.s[2]
    mov w15,v4.s[3]
10:
    ldp w7,w8,[x10],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor w6,w14,w15
    eor w9,w7,w13
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    eor w12,w12,w6
    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor w6,w14,w15
    eor w9,w12,w8
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    ldp w7,w8,[x10],8
    eor w13,w13,w6
    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor w6,w12,w13
    eor w9,w7,w15
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    eor w14,w14,w6
    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor w6,w12,w13
    eor w9,w14,w8
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    eor w15,w15,w6
    subs w11,w11,#1
    b.ne 10b
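    // 32 rounds done: SM4's final reverse transform simply reverses the word
    // order, so the state words are written back reversed before restoring the
    // byte order and applying the tweak.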
    mov v4.s[0],w15
    mov v4.s[1],w14
    mov v4.s[2],w13
    mov v4.s[3],w12
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
#endif
    eor v4.16b, v4.16b, v16.16b
    st1 {v4.4s},[x1],#16
    // save the last tweak
    mov v24.16b,v16.16b
    b 100f
1: // process last 2 blocks
    cmp x2,#2
    b.gt 1f
    ld1 {v4.4s,v5.4s},[x0],#32
    rbit v16.16b,v16.16b
    rbit v17.16b,v17.16b
    eor v4.16b, v4.16b, v16.16b
    eor v5.16b, v5.16b, v17.16b
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
    rev32 v5.16b,v5.16b
#endif
    zip1 v0.4s,v4.4s,v5.4s
    zip2 v1.4s,v4.4s,v5.4s
    zip1 v2.4s,v6.4s,v7.4s
    zip2 v3.4s,v6.4s,v7.4s
    zip1 v4.2d,v0.2d,v2.2d
    zip2 v5.2d,v0.2d,v2.2d
    zip1 v6.2d,v1.2d,v3.2d
    zip2 v7.2d,v1.2d,v3.2d
    bl Sm4Enc4blks
    zip1 v4.4s,v0.4s,v1.4s
    zip2 v5.4s,v0.4s,v1.4s
    zip1 v6.4s,v2.4s,v3.4s
    zip2 v7.4s,v2.4s,v3.4s
    zip1 v0.2d,v4.2d,v6.2d
    zip2 v1.2d,v4.2d,v6.2d
    zip1 v2.2d,v5.2d,v7.2d
    zip2 v3.2d,v5.2d,v7.2d
    eor v0.16b, v0.16b, v16.16b
    eor v1.16b, v1.16b, v17.16b
    st1 {v0.4s,v1.4s},[x1],#32
    // save the last tweak
    mov v24.16b,v17.16b
    b 100f
1: // process last 3 blocks
    ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
    rbit v16.16b,v16.16b
    rbit v17.16b,v17.16b
    rbit v18.16b,v18.16b
    eor v4.16b, v4.16b, v16.16b
    eor v5.16b, v5.16b, v17.16b
    eor v6.16b, v6.16b, v18.16b
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
    rev32 v5.16b,v5.16b
    rev32 v6.16b,v6.16b
#endif
    zip1 v0.4s,v4.4s,v5.4s
    zip2 v1.4s,v4.4s,v5.4s
    zip1 v2.4s,v6.4s,v7.4s
    zip2 v3.4s,v6.4s,v7.4s
    zip1 v4.2d,v0.2d,v2.2d
    zip2 v5.2d,v0.2d,v2.2d
    zip1 v6.2d,v1.2d,v3.2d
    zip2 v7.2d,v1.2d,v3.2d
    bl Sm4Enc4blks
    zip1 v4.4s,v0.4s,v1.4s
    zip2 v5.4s,v0.4s,v1.4s
    zip1 v6.4s,v2.4s,v3.4s
    zip2 v7.4s,v2.4s,v3.4s
    zip1 v0.2d,v4.2d,v6.2d
    zip2 v1.2d,v4.2d,v6.2d
    zip1 v2.2d,v5.2d,v7.2d
    zip2 v3.2d,v5.2d,v7.2d
    eor v0.16b, v0.16b, v16.16b
    eor v1.16b, v1.16b, v17.16b
    eor v2.16b, v2.16b, v18.16b
    st1 {v0.4s,v1.4s,v2.4s},[x1],#48
    // save the last tweak
    mov v24.16b,v18.16b
100:
    cmp x29,0
    b.eq .Lxts_cipher_return

// This branch calculates the last two tweaks,
// used when the encryption/decryption length is larger than 32.
.last_2blks_tweak:
#ifdef HITLS_BIG_ENDIAN
    rev32 v24.16b,v24.16b
#endif
    rbit v2.16b,v24.16b
    adrp x26, .Lxts_magic
    add x26, x26, :lo12:.Lxts_magic
    ldr q0, [x26]
    shl v17.16b, v2.16b, #1
    ext v1.16b, v2.16b, v2.16b,#15
    ushr v1.16b, v1.16b, #7
    mul v1.16b, v1.16b, v0.16b
    eor v17.16b, v17.16b, v1.16b
    rbit v17.16b,v17.16b
    rbit v2.16b,v17.16b
    adrp x26, .Lxts_magic
    add x26, x26, :lo12:.Lxts_magic
    ldr q0, [x26]
    shl v18.16b, v2.16b, #1
    ext v1.16b, v2.16b, v2.16b,#15
    ushr v1.16b, v1.16b, #7
    mul v1.16b, v1.16b, v0.16b
    eor v18.16b, v18.16b, v1.16b
    rbit v18.16b,v18.16b
    b .Lxts_check_dec


// This branch calculates the last two tweaks,
// used when the encryption/decryption length is exactly 32, which only needs two tweaks.
.only_2blks_tweak:
    mov v17.16b,v16.16b
#ifdef HITLS_BIG_ENDIAN
    rev32 v17.16b,v17.16b
#endif
    rbit v2.16b,v17.16b
    adrp x26, .Lxts_magic
    add x26, x26, :lo12:.Lxts_magic
    ldr q0, [x26]
    shl v18.16b, v2.16b, #1
    ext v1.16b, v2.16b, v2.16b,#15
    ushr v1.16b, v1.16b, #7
    mul v1.16b, v1.16b, v0.16b
    eor v18.16b, v18.16b, v1.16b
    rbit v18.16b,v18.16b
    b .Lxts_check_dec
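// Note: each doubling above multiplies a tweak by x in GF(2^128) (the XTS tweak
// update): the tweak is bit-reversed (rbit), every byte is shifted by one bit
// (shl), the bits that cross byte boundaries are recovered with ext/ushr and
// folded back in using the constant at .Lxts_magic (which presumably carries the
// per-byte carry and the 0x87 reduction term), then bit-reversed back.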
// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.Lxts_check_dec:
    // encryption:1 decryption:0
    cmp w28,1
    b.eq .Lxts_prcess_last_2blks
    mov v0.16B,v17.16b
    mov v17.16B,v18.16b
    mov v18.16B,v0.16b

.Lxts_prcess_last_2blks:
#ifdef HITLS_BIG_ENDIAN
    rev32 v17.16b,v17.16b
    rev32 v18.16b,v18.16b
#endif
    ld1 {v4.4s},[x0],#16
    eor v4.16b, v4.16b, v17.16b
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
#endif
    mov x10,x3
    mov w11,#8
    mov w12,v4.s[0]
    mov w13,v4.s[1]
    mov w14,v4.s[2]
    mov w15,v4.s[3]
10:
    ldp w7,w8,[x10],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor w6,w14,w15
    eor w9,w7,w13
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    eor w12,w12,w6
    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor w6,w14,w15
    eor w9,w12,w8
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    ldp w7,w8,[x10],8
    eor w13,w13,w6
    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor w6,w12,w13
    eor w9,w7,w15
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    eor w14,w14,w6
    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor w6,w12,w13
    eor w9,w14,w8
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    eor w15,w15,w6
    subs w11,w11,#1
    b.ne 10b
    mov v4.s[0],w15
    mov v4.s[1],w14
    mov v4.s[2],w13
    mov v4.s[3],w12
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
#endif
    eor v4.16b, v4.16b, v17.16b
    st1 {v4.4s},[x1],#16

    // ciphertext stealing: swap the remaining x29 input bytes with the leading
    // bytes of the block just written, emit the stolen bytes as the partial final
    // block, then re-encrypt the patched block in place under the last tweak (v18)
    sub x26,x1,16
.Lxts_loop:
    subs x29,x29,1
    ldrb w7,[x26,x29]
    ldrb w8,[x0,x29]
    strb w8,[x26,x29]
    strb w7,[x1,x29]
    b.gt .Lxts_loop
    ld1 {v4.4s}, [x26]
    eor v4.16b, v4.16b, v18.16b
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
#endif
    mov x10,x3
    mov w11,#8
    mov w12,v4.s[0]
    mov w13,v4.s[1]
    mov w14,v4.s[2]
    mov w15,v4.s[3]
10:
    ldp w7,w8,[x10],8
    // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
    eor w6,w14,w15
    eor w9,w7,w13
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    eor w12,w12,w6
    // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
    eor w6,w14,w15
    eor w9,w12,w8
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    ldp w7,w8,[x10],8
    eor w13,w13,w6
    // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
    eor w6,w12,w13
    eor w9,w7,w15
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    eor w14,w14,w6
    // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
    eor w6,w12,w13
    eor w9,w14,w8
    eor w6,w6,w9
    movi v31.16b, #0x0f
    mov v3.s[0],w6
    // optimize sbox using AESE instruction
    tbl v0.16b, {v3.16b}, v26.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v28.16b}, v0.16b
    tbl v2.16b, {v27.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b
    eor v1.16b, v1.16b, v1.16b
    aese v0.16b,v1.16b
    ushr v2.16b, v0.16b, 4
    and v0.16b, v0.16b, v31.16b
    tbl v0.16b, {v30.16b}, v0.16b
    tbl v2.16b, {v29.16b}, v2.16b
    eor v0.16b, v0.16b, v2.16b

    mov w7,v0.s[0]
    eor w6,w7,w7,ror #32-2
    eor w6,w6,w7,ror #32-10
    eor w6,w6,w7,ror #32-18
    eor w6,w6,w7,ror #32-24
    eor w15,w15,w6
    subs w11,w11,#1
    b.ne 10b
    mov v4.s[0],w15
    mov v4.s[1],w14
    mov v4.s[2],w13
    mov v4.s[3],w12
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
#endif
    eor v4.16b, v4.16b, v18.16b
    st1 {v4.4s}, [x26]
.Lxts_cipher_return:
    add sp, sp, #192
    ldp d14, d15, [sp], #0x10
    ldp d12, d13, [sp], #0x10
    ldp d10, d11, [sp], #0x10
    ldp d8, d9, [sp], #0x10
    ldp x29, x30, [sp], #0x10
    ldp x27, x28, [sp], #0x10
    ldp x25, x26, [sp], #0x10
    ldp x23, x24, [sp], #0x10
    ldp x21, x22, [sp], #0x10
    ldp x19, x20, [sp], #0x10
AARCH64_AUTIASP
    ret
.size Vpsm4XtsCipher,.-Vpsm4XtsCipher

.globl Vpsm4Cfb128Encrypt
.type Vpsm4Cfb128Encrypt,%function
.align 5
Vpsm4Cfb128Encrypt:
AARCH64_PACIASP
    stp x29,x30,[sp,#-80]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x8,[sp,#48]
    stp x16,x17,[sp,#64]

    // load tbox
    adrp x19, .Ltbox1
    add x19,x19,:lo12:.Ltbox1
    adrp x20, .Ltbox2
    add x20,x20,:lo12:.Ltbox2
    adrp x21, .Ltbox3
    add x21,x21,:lo12:.Ltbox3
    adrp x22, .Ltbox4
    add x22,x22,:lo12:.Ltbox4

    // load num
    ldr w23,[x5]
    cbz w23,.Lcfb128_enc_update
.Lcfb128_enc_init:
    ldrb w7,[ivp,x23]
    ldrb w8,[inp]
    eor w7,w7,w8
    strb w7,[outp]
    strb w7,[ivp,x23]

    add inp,inp,#1
    add outp,outp,#1
    add w23,w23,#1
    sub len,len,#1
    cmp w23,#16
    b.eq .Lcfb128_enc_init_final
    cbz len,.Lcfb128_enc_ret
    b .Lcfb128_enc_init
.Lcfb128_enc_init_final:
    mov w23,#0
.Lcfb128_enc_update:
    cbz len,.Lcfb128_enc_ret
    // load iv
    ldp w8,w9,[ivp]
    ldp w10,w11,[ivp,#8]
#ifndef HITLS_BIG_ENDIAN
    rev w8,w8
    rev w9,w9
    rev w10,w10
    rev w11,w11
#endif
    EncRound
#ifndef HITLS_BIG_ENDIAN
    rev w8,w8
    rev w9,w9
    rev w10,w10
    rev w11,w11
#endif
    // save back IV
    stp w11,w10,[ivp]
    stp w9,w8,[ivp,#8]

    cmp len,#16
    b.lt .Lcfb128_enc_final
    // xor with plain
    ldp w6,w7,[inp],#8
    ldp w16,w17,[inp],#8
    eor w11,w11,w6
    eor w10,w10,w7
    eor w9,w9,w16
    eor w8,w8,w17

    stp w11,w10,[outp],#8
    stp w9,w8,[outp],#8
    // save back IV
    stp w11,w10,[ivp]
    stp w9,w8,[ivp,#8]

    sub len,len,#16
    b .Lcfb128_enc_update
.Lcfb128_enc_final:
    ldrb w7,[ivp,x23]
    ldrb w8,[inp]
    eor w7,w7,w8
    strb w7,[outp]
    strb w7,[ivp,x23]

    add inp,inp,#1
    add outp,outp,#1
    add w23,w23,#1
    subs len,len,#1
    b.ne .Lcfb128_enc_final
.Lcfb128_enc_ret:
    // store num
    str w23,[x5]

    // restore registers
    ldp x19,x20,[sp,#16]
    ldp x21,x22,[sp,#32]
    ldp x23,x8,[sp,#48]
    ldp x16,x17,[sp,#64]
    ldp x29,x30,[sp],#80
AARCH64_AUTIASP
    ret
.size Vpsm4Cfb128Encrypt,.-Vpsm4Cfb128Encrypt

# void Vpsm4Cfb128Decrypt(const uint8_t *in, uint8_t *out, uint64_t len, const uint32_t *key, uint8_t *iv, int *num);
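// Note: CFB-128 decryption is parallelizable because every keystream block is the
// SM4 encryption of an already-known ciphertext block (or of the IV), so the code
// below batches 12, 8 or 4 blocks per Sm4EncNblks call and falls back to a scalar
// path for the tail; *num tracks the byte offset inside a partially consumed block.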
.globl Vpsm4Cfb128Decrypt
.type Vpsm4Cfb128Decrypt,%function
.align 5
Vpsm4Cfb128Decrypt:
AARCH64_PACIASP
    stp x29,x30,[sp,#-128]!
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp d8,d9,[sp,#64]
    stp d10,d11,[sp,#80]
    stp d12,d13,[sp,#96]
    stp d14,d15,[sp,#112]

    // load tbox
    adrp x19, .Ltbox1
    add x19,x19,:lo12:.Ltbox1
    adrp x20, .Ltbox2
    add x20,x20,:lo12:.Ltbox2
    adrp x21, .Ltbox3
    add x21,x21,:lo12:.Ltbox3
    adrp x22, .Ltbox4
    add x22,x22,:lo12:.Ltbox4
    LoadSbox
    // load num
    ldr w23,[x5]
    cbz w23,.Lcfb128_12_blocks_dec

.Lcfb128_dec_init:
    ldrb w7,[ivp,x23]
    ldrb w8,[inp]
    eor w7,w7,w8
    strb w7,[outp]
    // store the input (ciphertext) byte into iv
    strb w8,[ivp,x23]

    add inp,inp,#1
    add outp,outp,#1
    subs len,len,#1
    add w23,w23,#1
    and w23,w23,#15
    b.eq 100f
    cbz w23,.Lcfb128_12_blocks_dec
    b .Lcfb128_dec_init

.Lcfb128_12_blocks_dec:
    cmp len,#192
    b.lt .Lcfb128_8_blocks_dec

    ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[inp]
    // append iv as last element
    ld4 {v4.s,v5.s,v6.s,v7.s}[3],[ivp]
    add ptr,inp,#48
    ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[ptr]
    add ptr,ptr,#64
    ld4 {v16.4s,v17.4s,v18.4s,v19.4s},[ptr]
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
    rev32 v5.16b,v5.16b
    rev32 v6.16b,v6.16b
    rev32 v7.16b,v7.16b

    rev32 v8.16b,v8.16b
    rev32 v9.16b,v9.16b
    rev32 v10.16b,v10.16b
    rev32 v11.16b,v11.16b

    rev32 v16.16b,v16.16b
    rev32 v17.16b,v17.16b
    rev32 v18.16b,v18.16b
    rev32 v19.16b,v19.16b
#endif
    bl Sm4Enc12blks

    transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d
    transpose v4.4s,v5.4s,v6.4s,v7.4s,v4.2d,v5.2d,v6.2d,v7.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d
    transpose v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d,v16.4s,v17.4s,v18.4s,v19.4s,v16.2d,v17.2d,v18.2d,v19.2d

    ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64
    ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
    eor v0.16b,v0.16b,v17.16b
    eor v1.16b,v1.16b,v18.16b
    eor v2.16b,v2.16b,v19.16b
    eor v3.16b,v3.16b,v16.16b
    // store the plaintext recovered from the encrypted IV as the first output block
    st1 {v3.4s},[outp],#16
    st1 {v0.4s,v1.4s,v2.4s},[outp],#48

    eor v4.16b,v4.16b,v12.16b
    eor v5.16b,v5.16b,v13.16b
    eor v6.16b,v6.16b,v14.16b
    eor v7.16b,v7.16b,v15.16b
    st1 {v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64

    ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[inp],#64
    eor v8.16b,v8.16b,v16.16b
    eor v9.16b,v9.16b,v17.16b
    eor v10.16b,v10.16b,v18.16b
    eor v11.16b,v11.16b,v19.16b
    st1 {v8.4s,v9.4s,v10.4s,v11.4s},[outp],#64
    // save back IV
    st1 {v19.4s}, [ivp]

    subs len,len,#192
    b.gt .Lcfb128_12_blocks_dec
    b.eq 100f

.Lcfb128_8_blocks_dec:
    cmp len,#128
    b.lt .Lcfb128_4_blocks_dec

    ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[inp]
    // append iv as last element
    ld4 {v4.s,v5.s,v6.s,v7.s}[3],[ivp]
    add ptr,inp,#48
    ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[ptr]
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
    rev32 v5.16b,v5.16b
    rev32 v6.16b,v6.16b
    rev32 v7.16b,v7.16b
    rev32 v8.16b,v8.16b
    rev32 v9.16b,v9.16b
    rev32 v10.16b,v10.16b
    rev32 v11.16b,v11.16b
#endif
    bl Sm4Enc8blks
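    // Un-transpose the eight encrypted values, then XOR E(IV), E(C0)..E(C6)
    // with C0..C7 to recover P0..P7; C7 is saved back as the IV for the next chunk.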
    transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
    transpose v4.4s,v5.4s,v6.4s,v7.4s,v4.2d,v5.2d,v6.2d,v7.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d

    ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[inp],#64
    ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[inp],#64
    eor v0.16b,v0.16b,v9.16b
    eor v1.16b,v1.16b,v10.16b
    eor v2.16b,v2.16b,v11.16b
    eor v3.16b,v3.16b,v8.16b
    // save back IV
    st1 {v15.4s}, [ivp]
    eor v4.16b,v4.16b,v12.16b
    eor v5.16b,v5.16b,v13.16b
    eor v6.16b,v6.16b,v14.16b
    eor v7.16b,v7.16b,v15.16b
    st1 {v3.4s},[outp],#16
    st1 {v0.4s,v1.4s,v2.4s},[outp],#48
    st1 {v4.4s,v5.4s,v6.4s,v7.4s},[outp],#64
    subs len,len,#128
    b.gt .Lcfb128_8_blocks_dec
    b.eq 100f
.Lcfb128_4_blocks_dec:
    cmp len,#64
    b.lt .Llast_block
    ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[inp]
    // append iv as last element
    ld4 {v4.s,v5.s,v6.s,v7.s}[3],[ivp]
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
    rev32 v5.16b,v5.16b
    rev32 v6.16b,v6.16b
    rev32 v7.16b,v7.16b
#endif
    bl Sm4Enc4blks
    ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[inp],#64
    transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
    eor v0.16b,v0.16b,v5.16b
    eor v1.16b,v1.16b,v6.16b
    eor v2.16b,v2.16b,v7.16b
    eor v3.16b,v3.16b,v4.16b
    st1 {v3.4s},[outp],#16
    st1 {v0.4s,v1.4s,v2.4s},[outp],#48
    // save back IV
    st1 {v7.4s}, [ivp]
    subs len,len,#64
    b.gt .Lcfb128_4_blocks_dec
    b.eq 100f

.Llast_block: // last block
    cmp len,#16
    b.gt .Llast_2_blocks
1:
    // load in
    ldp w6,w7,[inp]
    ldp w16,w17,[inp,#8]
    // load iv
    ldp w8,w9,[ivp]
    ldp w10,w11,[ivp,#8]
#ifndef HITLS_BIG_ENDIAN
    rev w8,w8
    rev w9,w9
    rev w10,w10
    rev w11,w11
#endif
    EncRound
#ifndef HITLS_BIG_ENDIAN
    rev w8,w8
    rev w9,w9
    rev w10,w10
    rev w11,w11
#endif
    // save encrypted iv
    stp w11,w10,[ivp]
    stp w9,w8,[ivp,#8]

    cmp len,#16
    b.lt .Lcfb128_dec_final

    stp w6,w7,[ivp]
    stp w16,w17,[ivp,#8]
    eor w11,w11,w6
    eor w10,w10,w7
    eor w9,w9,w16
    eor w8,w8,w17
    stp w11,w10,[outp],#8
    stp w9,w8,[outp],#8
    add inp,inp,#16
    subs len,len,#16
    b.gt 1b
    b.eq 100f
    b .Lcfb128_dec_final
.Llast_2_blocks: // last two blocks
    ld4 {v4.s,v5.s,v6.s,v7.s}[0],[ivp]
    mov ptr,inp
    ld4 {v4.s,v5.s,v6.s,v7.s}[1],[ptr],#16

    cmp x2,#32
    b.gt .Llast_3_blocks
    b.lt 1b
1:
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
    rev32 v5.16b,v5.16b
    rev32 v6.16b,v6.16b
    rev32 v7.16b,v7.16b
#endif
    bl Sm4Enc4blks
    ld1 {v4.4s,v5.4s},[inp],#32
    transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
    eor v0.16b,v0.16b,v4.16b
    eor v1.16b,v1.16b,v5.16b
    st1 {v0.4s,v1.4s},[outp],#32
    // save back IV
    st1 {v5.4s}, [ivp]
    subs len,len,#32
    b.eq 100f
    b .Llast_block
.Llast_3_blocks: // last 3 blocks
    cmp len,#48
    b.lt 1b
    ld4 {v4.s,v5.s,v6.s,v7.s}[2],[ptr]
#ifndef HITLS_BIG_ENDIAN
    rev32 v4.16b,v4.16b
    rev32 v5.16b,v5.16b
    rev32 v6.16b,v6.16b
    rev32 v7.16b,v7.16b
#endif
    bl Sm4Enc4blks
    ld1 {v4.4s,v5.4s,v6.4s},[inp],#48
    transpose v0.4s,v1.4s,v2.4s,v3.4s,v0.2d,v1.2d,v2.2d,v3.2d,v8.4s,v9.4s,v10.4s,v11.4s,v8.2d,v9.2d,v10.2d,v11.2d
    eor v0.16b,v0.16b,v4.16b
    eor v1.16b,v1.16b,v5.16b
    eor v2.16b,v2.16b,v6.16b
    st1 {v0.4s,v1.4s,v2.4s},[outp],#48
    // save back IV
    st1 {v6.4s}, [ivp]
    subs len,len,#48
    b.eq 100f
    b .Llast_block
.Lcfb128_dec_final:
    ldrb w7,[ivp,x23]
    ldrb w8,[inp]
    eor w7,w7,w8
    strb w7,[outp]
    // store the input (ciphertext) byte into iv
    strb w8,[ivp,x23]

    add inp,inp,#1
    add outp,outp,#1
    add w23,w23,#1
    subs len,len,#1
    b.ne .Lcfb128_dec_final
100:
    // store num
    str w23,[x5]
    ldp x19,x20,[sp,#16]
    ldp x21,x22,[sp,#32]
    ldp x23,x24,[sp,#48]
    ldp d8,d9,[sp,#64]
    ldp d10,d11,[sp,#80]
    ldp d12,d13,[sp,#96]
    ldp d14,d15,[sp,#112]
    ldp x29,x30,[sp],#128
AARCH64_AUTIASP
    ret
.size Vpsm4Cfb128Decrypt,.-Vpsm4Cfb128Decrypt

#endif