1 #include <string.h>
2 #include <stdint.h>
3 #include <endian.h>
4
#ifdef __GNUC__
/*
 * Helper operators and constants for the word-at-a-time memcpy() in this
 * file.  They are only used by the GNU C code path, hence the guard.
 * ("ALIGH" in several names is a misspelling of "ALIGN"; the names are kept
 * unchanged because the code refers to them.)
 */

/*
 * Determine the target byte order explicitly.  An undefined identifier in an
 * #if expression evaluates to 0, so a bare "__BYTE_ORDER == __LITTLE_ENDIAN"
 * would silently claim little-endian when neither macro exists.  Prefer the
 * <endian.h> macros, fall back to the compiler's predefined ones, and refuse
 * to build when the byte order cannot be established.
 */
#if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN)
#define MEMCPY_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN)
#elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__)
#define MEMCPY_IS_LITTLE_ENDIAN (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#else
#error "memcpy: target byte order is unknown"
#endif

/*
 * Shift operators used to merge two aligned source words into one aligned
 * destination word.  LS moves bytes towards lower addresses, RS towards
 * higher addresses, so the C operator behind each name depends on the
 * byte order.
 */
#if MEMCPY_IS_LITTLE_ENDIAN
#define LS >>
#define RS <<
#else
#define LS <<
#define RS >>
#endif

/* Bits per byte. */
#define MEMCPY_BYTE_BITS 8
/* Size in bytes of one copy unit (a 32-bit word). */
#define MEMCPY_ALIGH_UNIT_BYTES_4 4
/* Number of units moved per iteration of the unrolled copy loops. */
#define MEMCPY_FAST_COPY_UNIT_NUM_4 4
/* Bytes moved per iteration of the unrolled copy loops (4 * 4 = 16). */
#define MEMCPY_FAST_COPY_UNIT_BYTES (MEMCPY_ALIGH_UNIT_BYTES_4 * MEMCPY_FAST_COPY_UNIT_NUM_4)

/* Bits in one copy unit (8 * 4 = 32). */
#define MEMCPY_ALIGH_UNIT_BITS (MEMCPY_BYTE_BITS * MEMCPY_ALIGH_UNIT_BYTES_4)

/* Byte offsets of the four units inside one unrolled iteration: 0, 4, 8, 12. */
#define MEMCPY_FAST_COPY_OFFSET_UNIT_0 (MEMCPY_ALIGH_UNIT_BYTES_4 * 0)
#define MEMCPY_FAST_COPY_OFFSET_UNIT_1 (MEMCPY_ALIGH_UNIT_BYTES_4 * 1)
#define MEMCPY_FAST_COPY_OFFSET_UNIT_2 (MEMCPY_ALIGH_UNIT_BYTES_4 * 2)
#define MEMCPY_FAST_COPY_OFFSET_UNIT_3 (MEMCPY_ALIGH_UNIT_BYTES_4 * 3)

/* Minimum remaining length (32 bytes) for the shifted-word path to be used. */
#define MEMCPY_NOT_ALIGN_FAST_COPY_THRESHOLD (MEMCPY_FAST_COPY_UNIT_BYTES * 2)

/*
 * Per-misalignment constants, one group for each possible value k (1..3) of
 * "destination address modulo 4":
 *   MEMCPY_OFFSET_BYTES_k        k itself
 *   MEMCPY_ALIGH_OFFSET_BYTES_k  4 - k, bytes needed to reach alignment
 *   MEMCPY_OFFSET_BITS_k         k expressed in bits
 *   MEMCPY_OFFSET_ALIGN_BITS_k   4 - k expressed in bits
 */
#define MEMCPY_OFFSET_BYTES_1 1
#define MEMCPY_ALIGH_OFFSET_BYTES_1 (MEMCPY_ALIGH_UNIT_BYTES_4 - MEMCPY_OFFSET_BYTES_1)
#define MEMCPY_OFFSET_BITS_1 (MEMCPY_BYTE_BITS * MEMCPY_OFFSET_BYTES_1)
#define MEMCPY_OFFSET_ALIGN_BITS_1 (MEMCPY_BYTE_BITS * MEMCPY_ALIGH_OFFSET_BYTES_1)

#define MEMCPY_OFFSET_BYTES_2 2
#define MEMCPY_ALIGH_OFFSET_BYTES_2 (MEMCPY_ALIGH_UNIT_BYTES_4 - MEMCPY_OFFSET_BYTES_2)
#define MEMCPY_OFFSET_BITS_2 (MEMCPY_BYTE_BITS * MEMCPY_OFFSET_BYTES_2)
#define MEMCPY_OFFSET_ALIGN_BITS_2 (MEMCPY_BYTE_BITS * MEMCPY_ALIGH_OFFSET_BYTES_2)

#define MEMCPY_OFFSET_BYTES_3 3
#define MEMCPY_ALIGH_OFFSET_BYTES_3 (MEMCPY_ALIGH_UNIT_BYTES_4 - MEMCPY_OFFSET_BYTES_3)
#define MEMCPY_OFFSET_BITS_3 (MEMCPY_BYTE_BITS * MEMCPY_OFFSET_BYTES_3)
#define MEMCPY_OFFSET_ALIGN_BITS_3 (MEMCPY_BYTE_BITS * MEMCPY_ALIGH_OFFSET_BYTES_3)

/* Bit masks applied to the remaining byte count to pick the tail copies. */
#define MEMCPY_BYTE_CHECK_NUM_1 0x01
#define MEMCPY_BYTE_CHECK_NUM_2 0x02
#define MEMCPY_BYTE_CHECK_NUM_4 0x04
#define MEMCPY_BYTE_CHECK_NUM_8 0x08
#define MEMCPY_BYTE_CHECK_NUM_16 0x10
#endif
49
memcpy(void * restrict dest,const void * restrict src,size_t num)50 void *memcpy(void *restrict dest, const void *restrict src, size_t num)
51 {
52 unsigned char *d = dest;
53 const unsigned char *s = src;
54 size_t n = num;
55
56 #ifdef __GNUC__
57 typedef uint32_t __attribute__((__may_alias__)) u32;
58 uint32_t w, x;
59
60 for (; (uintptr_t)s % MEMCPY_ALIGH_UNIT_BYTES_4 && n; n--) {
61 *d++ = *s++;
62 }
63
64 if ((uintptr_t)d % MEMCPY_ALIGH_UNIT_BYTES_4 == 0) {
65 for (; n >= MEMCPY_FAST_COPY_UNIT_BYTES; s += MEMCPY_FAST_COPY_UNIT_BYTES,
66 d += MEMCPY_FAST_COPY_UNIT_BYTES, n -= MEMCPY_FAST_COPY_UNIT_BYTES) {
67 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_0) = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_0);
68 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_1) = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_1);
69 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_2) = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_2);
70 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_3) = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_3);
71 }
72 if (n & MEMCPY_FAST_COPY_OFFSET_UNIT_2) {
73 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_0) = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_0);
74 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_1) = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_1);
75 d += MEMCPY_FAST_COPY_OFFSET_UNIT_2;
76 s += MEMCPY_FAST_COPY_OFFSET_UNIT_2;
77 }
78 if (n & MEMCPY_FAST_COPY_OFFSET_UNIT_1) {
79 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_0) = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_0);
80 d += MEMCPY_FAST_COPY_OFFSET_UNIT_1;
81 s += MEMCPY_FAST_COPY_OFFSET_UNIT_1;
82 }
83 if (n & MEMCPY_BYTE_CHECK_NUM_2) {
84 *d++ = *s++; *d++ = *s++;
85 }
86 if (n & MEMCPY_BYTE_CHECK_NUM_1) {
87 *d = *s;
88 }
89 return dest;
90 }
91
92 if (n >= MEMCPY_NOT_ALIGN_FAST_COPY_THRESHOLD) {
93 switch ((uintptr_t)d % MEMCPY_ALIGH_UNIT_BYTES_4) {
94 case MEMCPY_OFFSET_BYTES_1:
95 w = *(u32 *)s;
96 *d++ = *s++;
97 *d++ = *s++;
98 *d++ = *s++;
99 n -= MEMCPY_ALIGH_OFFSET_BYTES_1;
100 for (; n >= MEMCPY_FAST_COPY_UNIT_BYTES + MEMCPY_OFFSET_BYTES_1; s += MEMCPY_FAST_COPY_UNIT_BYTES,
101 d += MEMCPY_FAST_COPY_UNIT_BYTES, n -= MEMCPY_FAST_COPY_UNIT_BYTES) {
102 x = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_0 + MEMCPY_OFFSET_BYTES_1);
103 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_0) =
104 (w LS MEMCPY_OFFSET_ALIGN_BITS_1) | (x RS MEMCPY_OFFSET_BITS_1);
105 w = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_1 + MEMCPY_OFFSET_BYTES_1);
106 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_1) =
107 (x LS MEMCPY_OFFSET_ALIGN_BITS_1) | (w RS MEMCPY_OFFSET_BITS_1);
108 x = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_2 + MEMCPY_OFFSET_BYTES_1);
109 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_2) =
110 (w LS MEMCPY_OFFSET_ALIGN_BITS_1) | (x RS MEMCPY_OFFSET_BITS_1);
111 w = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_3 + MEMCPY_OFFSET_BYTES_1);
112 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_3) =
113 (x LS MEMCPY_OFFSET_ALIGN_BITS_1) | (w RS MEMCPY_OFFSET_BITS_1);
114 }
115 break;
116 case MEMCPY_OFFSET_BYTES_2:
117 w = *(u32 *)s;
118 *d++ = *s++;
119 *d++ = *s++;
120 n -= MEMCPY_ALIGH_OFFSET_BYTES_2;
121 for (; n >= MEMCPY_FAST_COPY_UNIT_BYTES + MEMCPY_OFFSET_BYTES_2; s += MEMCPY_FAST_COPY_UNIT_BYTES,
122 d += MEMCPY_FAST_COPY_UNIT_BYTES, n -= MEMCPY_FAST_COPY_UNIT_BYTES) {
123 x = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_0 + MEMCPY_OFFSET_BYTES_2);
124 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_0) =
125 (w LS MEMCPY_OFFSET_ALIGN_BITS_2) | (x RS MEMCPY_OFFSET_BITS_2);
126 w = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_1 + MEMCPY_OFFSET_BYTES_2);
127 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_1) =
128 (x LS MEMCPY_OFFSET_ALIGN_BITS_2) | (w RS MEMCPY_OFFSET_BITS_2);
129 x = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_2 + MEMCPY_OFFSET_BYTES_2);
130 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_2) =
131 (w LS MEMCPY_OFFSET_ALIGN_BITS_2) | (x RS MEMCPY_OFFSET_BITS_2);
132 w = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_3 + MEMCPY_OFFSET_BYTES_2);
133 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_3) =
134 (x LS MEMCPY_OFFSET_ALIGN_BITS_2) | (w RS MEMCPY_OFFSET_BITS_2);
135 }
136 break;
137 case MEMCPY_OFFSET_BYTES_3:
138 w = *(u32 *)s;
139 *d++ = *s++;
140 n -= MEMCPY_ALIGH_OFFSET_BYTES_3;
141 for (; n >= MEMCPY_FAST_COPY_UNIT_BYTES + MEMCPY_OFFSET_BYTES_3; s += MEMCPY_FAST_COPY_UNIT_BYTES,
142 d += MEMCPY_FAST_COPY_UNIT_BYTES, n -= MEMCPY_FAST_COPY_UNIT_BYTES) {
143 x = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_0 + MEMCPY_OFFSET_BYTES_3);
144 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_0) =
145 (w LS MEMCPY_OFFSET_ALIGN_BITS_3) | (x RS MEMCPY_OFFSET_BITS_3);
146 w = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_1 + MEMCPY_OFFSET_BYTES_3);
147 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_1) =
148 (x LS MEMCPY_OFFSET_ALIGN_BITS_3) | (w RS MEMCPY_OFFSET_BITS_3);
149 x = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_2 + MEMCPY_OFFSET_BYTES_3);
150 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_2) =
151 (w LS MEMCPY_OFFSET_ALIGN_BITS_3) | (x RS MEMCPY_OFFSET_BITS_3);
152 w = *(u32 *)(s + MEMCPY_FAST_COPY_OFFSET_UNIT_3 + MEMCPY_OFFSET_BYTES_3);
153 *(u32 *)(d + MEMCPY_FAST_COPY_OFFSET_UNIT_3) =
154 (x LS MEMCPY_OFFSET_ALIGN_BITS_3) | (w RS MEMCPY_OFFSET_BITS_3);
155 }
156 break;
157 }
158 }
159 if (n & MEMCPY_BYTE_CHECK_NUM_16) {
160 *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
161 *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
162 *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
163 *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
164 }
165 if (n & MEMCPY_BYTE_CHECK_NUM_8) {
166 *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
167 *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
168 }
169 if (n & MEMCPY_BYTE_CHECK_NUM_4) {
170 *d++ = *s++; *d++ = *s++; *d++ = *s++; *d++ = *s++;
171 }
172 if (n & MEMCPY_BYTE_CHECK_NUM_2) {
173 *d++ = *s++; *d++ = *s++;
174 }
175 if (n & MEMCPY_BYTE_CHECK_NUM_1) {
176 *d = *s;
177 }
178 return dest;
179 #endif
180
181 for (; n; n--) *d++ = *s++;
182 return dest;
183 }
184