1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build amd64,!gccgo,!appengine 6 7package curve25519 8 9// These functions are implemented in the .s files. The names of the functions 10// in the rest of the file are also taken from the SUPERCOP sources to help 11// people following along. 12 13//go:noescape 14 15func cswap(inout *[5]uint64, v uint64) 16 17//go:noescape 18 19func ladderstep(inout *[5][5]uint64) 20 21//go:noescape 22 23func freeze(inout *[5]uint64) 24 25//go:noescape 26 27func mul(dest, a, b *[5]uint64) 28 29//go:noescape 30 31func square(out, in *[5]uint64) 32 33// mladder uses a Montgomery ladder to calculate (xr/zr) *= s. 34func mladder(xr, zr *[5]uint64, s *[32]byte) { 35 var work [5][5]uint64 36 37 work[0] = *xr 38 setint(&work[1], 1) 39 setint(&work[2], 0) 40 work[3] = *xr 41 setint(&work[4], 1) 42 43 j := uint(6) 44 var prevbit byte 45 46 for i := 31; i >= 0; i-- { 47 for j < 8 { 48 bit := ((*s)[i] >> j) & 1 49 swap := bit ^ prevbit 50 prevbit = bit 51 cswap(&work[1], uint64(swap)) 52 ladderstep(&work) 53 j-- 54 } 55 j = 7 56 } 57 58 *xr = work[1] 59 *zr = work[2] 60} 61 62func scalarMult(out, in, base *[32]byte) { 63 var e [32]byte 64 copy(e[:], (*in)[:]) 65 e[0] &= 248 66 e[31] &= 127 67 e[31] |= 64 68 69 var t, z [5]uint64 70 unpack(&t, base) 71 mladder(&t, &z, &e) 72 invert(&z, &z) 73 mul(&t, &t, &z) 74 pack(out, &t) 75} 76 77func setint(r *[5]uint64, v uint64) { 78 r[0] = v 79 r[1] = 0 80 r[2] = 0 81 r[3] = 0 82 r[4] = 0 83} 84 85// unpack sets r = x where r consists of 5, 51-bit limbs in little-endian 86// order. 87func unpack(r *[5]uint64, x *[32]byte) { 88 r[0] = uint64(x[0]) | 89 uint64(x[1])<<8 | 90 uint64(x[2])<<16 | 91 uint64(x[3])<<24 | 92 uint64(x[4])<<32 | 93 uint64(x[5])<<40 | 94 uint64(x[6]&7)<<48 95 96 r[1] = uint64(x[6])>>3 | 97 uint64(x[7])<<5 | 98 uint64(x[8])<<13 | 99 uint64(x[9])<<21 | 100 uint64(x[10])<<29 | 101 uint64(x[11])<<37 | 102 uint64(x[12]&63)<<45 103 104 r[2] = uint64(x[12])>>6 | 105 uint64(x[13])<<2 | 106 uint64(x[14])<<10 | 107 uint64(x[15])<<18 | 108 uint64(x[16])<<26 | 109 uint64(x[17])<<34 | 110 uint64(x[18])<<42 | 111 uint64(x[19]&1)<<50 112 113 r[3] = uint64(x[19])>>1 | 114 uint64(x[20])<<7 | 115 uint64(x[21])<<15 | 116 uint64(x[22])<<23 | 117 uint64(x[23])<<31 | 118 uint64(x[24])<<39 | 119 uint64(x[25]&15)<<47 120 121 r[4] = uint64(x[25])>>4 | 122 uint64(x[26])<<4 | 123 uint64(x[27])<<12 | 124 uint64(x[28])<<20 | 125 uint64(x[29])<<28 | 126 uint64(x[30])<<36 | 127 uint64(x[31]&127)<<44 128} 129 130// pack sets out = x where out is the usual, little-endian form of the 5, 131// 51-bit limbs in x. 132func pack(out *[32]byte, x *[5]uint64) { 133 t := *x 134 freeze(&t) 135 136 out[0] = byte(t[0]) 137 out[1] = byte(t[0] >> 8) 138 out[2] = byte(t[0] >> 16) 139 out[3] = byte(t[0] >> 24) 140 out[4] = byte(t[0] >> 32) 141 out[5] = byte(t[0] >> 40) 142 out[6] = byte(t[0] >> 48) 143 144 out[6] ^= byte(t[1]<<3) & 0xf8 145 out[7] = byte(t[1] >> 5) 146 out[8] = byte(t[1] >> 13) 147 out[9] = byte(t[1] >> 21) 148 out[10] = byte(t[1] >> 29) 149 out[11] = byte(t[1] >> 37) 150 out[12] = byte(t[1] >> 45) 151 152 out[12] ^= byte(t[2]<<6) & 0xc0 153 out[13] = byte(t[2] >> 2) 154 out[14] = byte(t[2] >> 10) 155 out[15] = byte(t[2] >> 18) 156 out[16] = byte(t[2] >> 26) 157 out[17] = byte(t[2] >> 34) 158 out[18] = byte(t[2] >> 42) 159 out[19] = byte(t[2] >> 50) 160 161 out[19] ^= byte(t[3]<<1) & 0xfe 162 out[20] = byte(t[3] >> 7) 163 out[21] = byte(t[3] >> 15) 164 out[22] = byte(t[3] >> 23) 165 out[23] = byte(t[3] >> 31) 166 out[24] = byte(t[3] >> 39) 167 out[25] = byte(t[3] >> 47) 168 169 out[25] ^= byte(t[4]<<4) & 0xf0 170 out[26] = byte(t[4] >> 4) 171 out[27] = byte(t[4] >> 12) 172 out[28] = byte(t[4] >> 20) 173 out[29] = byte(t[4] >> 28) 174 out[30] = byte(t[4] >> 36) 175 out[31] = byte(t[4] >> 44) 176} 177 178// invert calculates r = x^-1 mod p using Fermat's little theorem. 179func invert(r *[5]uint64, x *[5]uint64) { 180 var z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t [5]uint64 181 182 square(&z2, x) /* 2 */ 183 square(&t, &z2) /* 4 */ 184 square(&t, &t) /* 8 */ 185 mul(&z9, &t, x) /* 9 */ 186 mul(&z11, &z9, &z2) /* 11 */ 187 square(&t, &z11) /* 22 */ 188 mul(&z2_5_0, &t, &z9) /* 2^5 - 2^0 = 31 */ 189 190 square(&t, &z2_5_0) /* 2^6 - 2^1 */ 191 for i := 1; i < 5; i++ { /* 2^20 - 2^10 */ 192 square(&t, &t) 193 } 194 mul(&z2_10_0, &t, &z2_5_0) /* 2^10 - 2^0 */ 195 196 square(&t, &z2_10_0) /* 2^11 - 2^1 */ 197 for i := 1; i < 10; i++ { /* 2^20 - 2^10 */ 198 square(&t, &t) 199 } 200 mul(&z2_20_0, &t, &z2_10_0) /* 2^20 - 2^0 */ 201 202 square(&t, &z2_20_0) /* 2^21 - 2^1 */ 203 for i := 1; i < 20; i++ { /* 2^40 - 2^20 */ 204 square(&t, &t) 205 } 206 mul(&t, &t, &z2_20_0) /* 2^40 - 2^0 */ 207 208 square(&t, &t) /* 2^41 - 2^1 */ 209 for i := 1; i < 10; i++ { /* 2^50 - 2^10 */ 210 square(&t, &t) 211 } 212 mul(&z2_50_0, &t, &z2_10_0) /* 2^50 - 2^0 */ 213 214 square(&t, &z2_50_0) /* 2^51 - 2^1 */ 215 for i := 1; i < 50; i++ { /* 2^100 - 2^50 */ 216 square(&t, &t) 217 } 218 mul(&z2_100_0, &t, &z2_50_0) /* 2^100 - 2^0 */ 219 220 square(&t, &z2_100_0) /* 2^101 - 2^1 */ 221 for i := 1; i < 100; i++ { /* 2^200 - 2^100 */ 222 square(&t, &t) 223 } 224 mul(&t, &t, &z2_100_0) /* 2^200 - 2^0 */ 225 226 square(&t, &t) /* 2^201 - 2^1 */ 227 for i := 1; i < 50; i++ { /* 2^250 - 2^50 */ 228 square(&t, &t) 229 } 230 mul(&t, &t, &z2_50_0) /* 2^250 - 2^0 */ 231 232 square(&t, &t) /* 2^251 - 2^1 */ 233 square(&t, &t) /* 2^252 - 2^2 */ 234 square(&t, &t) /* 2^253 - 2^3 */ 235 236 square(&t, &t) /* 2^254 - 2^4 */ 237 238 square(&t, &t) /* 2^255 - 2^5 */ 239 mul(r, &t, &z11) /* 2^255 - 21 */ 240} 241