#include "global.h"
#include "sys/util.h"

void enter_critical_section()
{
#ifdef CPU_ARM_CORTEX
    __asm__ volatile("cpsid if");       // mask all configurable interrupts (PRIMASK and FAULTMASK)
#else
    __asm__ volatile(
        "mrs r0, cpsr      \n\t"        // read the current program status register
        "orr r0, r0, #0xc0 \n\t"        // set the I and F bits: mask IRQ and FIQ
        "msr cpsr_c, r0    \n\t"        // write back the control field
    ::: "r0");
#endif
}

void leave_critical_section()
{
#ifdef CPU_ARM_CORTEX
    __asm__ volatile("cpsie if");       // unmask all configurable interrupts (PRIMASK and FAULTMASK)
#else
    __asm__ volatile(
        "mrs r0, cpsr      \n\t"        // read the current program status register
        "bic r0, r0, #0xc0 \n\t"        // clear the I and F bits: unmask IRQ and FIQ
        "msr cpsr_c, r0    \n\t"        // write back the control field
    ::: "r0");
#endif
}
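
// Usage sketch (illustration, not part of the original file): the two functions
// above are meant to be used as a matched pair around code that must not be
// interrupted. Note that this implementation does not nest or save the previous
// mask state: leave_critical_section() unconditionally re-enables interrupts.
// The shared_counter variable below is hypothetical.
//
//     enter_critical_section();
//     shared_counter++;               // touch state shared with interrupt handlers
//     leave_critical_section();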

void clean_dcache(const void* addr, uint32_t len)
{
    // Note: addr and len are currently ignored; the entire data cache is cleaned.
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mov r0, #0                \n\t"
#if defined(CPU_ARM_OLD_V6)
        "mcr p15, 0, r0,c7,c10,0   \n\t"    // clean entire dcache
#elif defined(CPU_ARM_OLD_V5)
        "0:                        \n\t"
        "mrc p15, 0, r15,c7,c10,3  \n\t"    // test and clean dcache, setting the flags
        "bne 0b                    \n\t"    // repeat until no dirty lines remain
#elif defined(CPU_ARM_OLD_V4)
        "0:                        \n\t"
        "mcr p15, 0, r0,c7,c10,2   \n\t"    // clean dcache entry by index
        "add r1, r0, #0x10         \n\t"
        "mcr p15, 0, r1,c7,c10,2   \n\t"
        "add r1, r1, #0x10         \n\t"
        "mcr p15, 0, r1,c7,c10,2   \n\t"
        "add r1, r1, #0x10         \n\t"
        "mcr p15, 0, r1,c7,c10,2   \n\t"
        "adds r0, r0, #0x04000000  \n\t"    // advance the index, wrapping to zero when done
        "bne 0b                    \n\t"
#else
#error Please implement clean_dcache for this ARM core generation!
#endif
        "mcr p15, 0, r0,c7,c10,4   \n\t"    // drain write buffer
    ::: "r0", "r1");
#endif
}

void invalidate_dcache(const void* addr, uint32_t len)
{
    // Note: addr and len are currently ignored; the entire data cache is
    // cleaned and invalidated.
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mov r0, #0                \n\t"
#if defined(CPU_ARM_OLD_V6)
        "mcr p15, 0, r0,c7,c14,0   \n\t"    // clean and invalidate entire dcache
#elif defined(CPU_ARM_OLD_V5)
        "0:                        \n\t"
        "mrc p15, 0, r15,c7,c14,3  \n\t"    // test, clean and invalidate dcache, setting the flags
        "bne 0b                    \n\t"    // repeat until no dirty lines remain
#elif defined(CPU_ARM_OLD_V4)
        "0:                        \n\t"
        "mcr p15, 0, r0,c7,c14,2   \n\t"    // clean and invalidate dcache entry by index
        "add r1, r0, #0x10         \n\t"
        "mcr p15, 0, r1,c7,c14,2   \n\t"
        "add r1, r1, #0x10         \n\t"
        "mcr p15, 0, r1,c7,c14,2   \n\t"
        "add r1, r1, #0x10         \n\t"
        "mcr p15, 0, r1,c7,c14,2   \n\t"
        "adds r0, r0, #0x04000000  \n\t"    // advance the index, wrapping to zero when done
        "bne 0b                    \n\t"
#else
#error Please implement invalidate_dcache for this ARM core generation!
#endif
        "mcr p15, 0, r0,c7,c10,4   \n\t"    // drain write buffer
    ::: "r0", "r1");
#endif
}

void invalidate_icache(const void* addr, uint32_t len)
{
    // Note: addr and len are currently ignored; the entire instruction cache
    // is invalidated.
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mov r0, #0              \n\t"
        "mcr p15, 0, r0,c7,c5,0  \n\t"      // invalidate entire icache
#ifdef CPU_ARM_OLD_V6
        "mcr p15, 0, r0,c7,c5,4  \n\t"      // flush prefetch buffer
#endif
    ::: "r0");
#endif
}
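
// Usage sketch (illustration, not part of the original file): the classic
// sequence after writing code into RAM, e.g. when loading a program, is to
// clean the data cache so the instructions reach memory, then invalidate the
// instruction cache so stale instructions are not executed. load_address and
// load_size are hypothetical names.
//
//     clean_dcache(load_address, load_size);
//     invalidate_icache(load_address, load_size);
//     // ...jump to the freshly loaded code...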

void enable_mmu()
{
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mrc p15, 0, r0,c1,c0    \n\t"      // read the control register
        "orr r0, r0, #0x00001000 \n\t"      // set bit 12: enable the icache
        "orr r0, r0, #0x00000005 \n\t"      // set bits 2 and 0: enable the dcache and MMU
        "mcr p15, 0, r0,c1,c0    \n\t"      // write back the control register
    ::: "r0");
#endif
}

void disable_mmu()
{
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mrc p15, 0, r0,c1,c0    \n\t"      // read the control register
        "bic r0, r0, #0x00001000 \n\t"      // clear bit 12: disable the icache
        "bic r0, r0, #0x00000005 \n\t"      // clear bits 2 and 0: disable the dcache and MMU
        "mcr p15, 0, r0,c1,c0    \n\t"      // write back the control register
    ::: "r0");
#endif
}
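
// For reference, a minimal C sketch of what the two functions above do
// (illustration only; read_control/write_control are hypothetical helpers
// standing in for the mrc/mcr instructions):
//
//     uint32_t ctrl = read_control();          // mrc p15, 0, rX, c1, c0
//     ctrl |= 0x00001000 | 0x00000005;         // icache | dcache | MMU enable bits
//     write_control(ctrl);                     // mcr p15, 0, rX, c1, c0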

__attribute__((naked, noinline)) void* memset(void* dst_void, int val, size_t len)
{
    __asm__ volatile(
        ".syntax unified               \n"
        // args: R0: dest, R1: value, R2: len; returns: dest
        "    mov r3, r0                \n" // write through R3 so that R0 survives as the return value
#ifndef SQUEEZE
        "    cmp r2, #8                \n" // TEST len - 8
        "    blt 2f                    \n" // IF len < 8: THEN GOTO [bytewise]
        "    orr r1, r1, r1, lsl #8    \n" // replicate the fill byte into both bytes of a halfword (assumes val fits in one byte)
        "    orr r1, r1, r1, lsl #16   \n" // replicate the halfword into a full word
        "1:                            \n" // DO: align dest to a word boundary
        "    tst r3, #3                \n" // TEST dest & 3
        "    itt ne                    \n" // IF dest & 3:
        "    strbne r1, [r3], #1       \n" // THEN *dest++ = value (byte)
        "    subne r2, r2, #1          \n" // THEN len--
        "    bne 1b                    \n" // WHILE dest & 3
        "    mov r12, r1               \n" // second copy of the fill word for 8-byte stores
        "1:                            \n" // DO: store 8 bytes at a time (unrolled 4x)
        "    cmp r2, #8                \n" // TEST len - 8
        "    blt 2f                    \n" // IF len < 8: THEN GOTO [bytewise]
        "    stmia r3!, {r1, r12}      \n" // *dest++ = {R1,R12} (dword)
        "    sub r2, r2, #8            \n" // len -= 8
        "    cmp r2, #8                \n" // TEST len - 8
        "    blt 2f                    \n" // IF len < 8: THEN GOTO [bytewise]
        "    stmia r3!, {r1, r12}      \n" // *dest++ = {R1,R12} (dword)
        "    sub r2, r2, #8            \n" // len -= 8
        "    cmp r2, #8                \n" // TEST len - 8
        "    blt 2f                    \n" // IF len < 8: THEN GOTO [bytewise]
        "    stmia r3!, {r1, r12}      \n" // *dest++ = {R1,R12} (dword)
        "    sub r2, r2, #8            \n" // len -= 8
        "    cmp r2, #8                \n" // TEST len - 8
        "    itt ge                    \n" // IF len >= 8:
        "    stmiage r3!, {r1, r12}    \n" // THEN *dest++ = {R1,R12} (dword)
        "    subge r2, r2, #8          \n" // THEN len -= 8
        "    bge 1b                    \n" // WHILE len >= 8
        "2:                            \n" // [bytewise]: less than 8 bytes remaining
#endif
        "    movs r2, r2               \n" // TEST len
        "    it eq                     \n" // IF len == 0:
        "    bxeq lr                   \n" // THEN return dest
        "1:                            \n" // DO:
        "    strb r1, [r3], #1         \n" // *dest++ = value (byte)
        "    subs r2, r2, #1           \n" // len--
        "    bne 1b                    \n" // WHILE len != 0
        "    bx lr                     \n" // return dest
    );
}
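
// For reference, a minimal C sketch of the intended semantics of the assembly
// above (illustration, not part of the original file):
//
//     void* memset_ref(void* dst, int val, size_t len)
//     {
//         unsigned char* p = (unsigned char*)dst;
//         while (len--) *p++ = (unsigned char)val;
//         return dst;
//     }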

__attribute__((naked, noinline)) void* memmove(void* dst, const void* src, size_t len)
{
    __asm__ volatile(
        ".syntax unified                   \n"
        // Copy memory forward or backward, allows overlapping src/dest
        // args: R0: dest, R1: src, R2: len; returns: dest
        "    cmp r1, r0                    \n" // TEST src - dest
        // If src == dest we don't need to do anything at all
        "    it eq                         \n" // IF src == dest:
        "    bxeq lr                       \n" // THEN return dest
        "    stmfd sp!, {r0, lr}           \n" // SAVE orig_dest return_addr // STACK: orig_dest return_addr
        // Check if we need to go backwards (if source < destination and there is overlap)
        "    itt cc                        \n" // IF src < dest:
        "    addcc r3, r1, r2              \n" // THEN src_end = src + len
        "    cmpcc r0, r3                  \n" // THEN TEST dest - src_end
        "    bcc 9f                        \n" // IF (src < dest) AND (dest < src_end): THEN GOTO [backward]
        "                                  \n" // (fallthrough)

#if defined(SQUEEZE)
        // Absolute minimum size forward implementation:
        "1:                                \n" // DO:
        // Stupidly copy a byte at a time
        "    subs r2, r2, #1               \n" // len--
        "    it lt                         \n" // IF len < 0:
        "    ldmfdlt sp!, {r0, pc}         \n" // THEN RETURN orig_dest
        "    ldrb r3, [r1], #1             \n" // R3 = *src++ (byte)
        "    strb r3, [r0], #1             \n" // *dest++ = R3 (byte)
        "    b 1b                          \n" // WHILE true
        // End of absolute minimum size forward implementation

#elif defined(OPTIMIZE_SIZE)
        // Size optimized forward implementation:
        // Skip to tail if we have to copy less than 4 bytes
        "    subs r2, r2, #4               \n" // len -= 4 // len offset: -4
        "    blt 2f                        \n" // IF len < 0: THEN GOTO [remainder]
        // Skip to tail if src or dest are misaligned
        "    tst r0, #3                    \n" // TEST dest & 3
        "    it eq                         \n" // IF !(dest & 3):
        "    tsteq r1, #3                  \n" // THEN TEST src & 3
        "    bne 2f                        \n" // IF (dest & 3) OR (src & 3): THEN GOTO [remainder]
        // Copy 4 bytes at a time until less than 4 are remaining
        "1:                                \n" // DO:
        "    ldr r3, [r1], #4              \n" // R3 = *src++ (word)
        "    str r3, [r0], #4              \n" // *dest++ = R3 (word)
        "    subs r2, r2, #4               \n" // len -= 4
        "    bge 1b                        \n" // WHILE len >= 0
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "2:                                \n" // [remainder]: copy the remainder byte by byte
        // Correct length offset from 4 byte copying mode
        "    adds r2, r2, #4               \n" // len += 4 // len offset: 0
        "1:                                \n" // DO:
        // Copy the remainder a byte at a time
        "    it eq                         \n" // IF len == 0:
        "    ldmfdeq sp!, {r0, pc}         \n" // THEN RETURN orig_dest
        "    ldrb r3, [r1], #1             \n" // R3 = *src++ (byte)
        "    strb r3, [r0], #1             \n" // *dest++ = R3 (byte)
        "    subs r2, r2, #1               \n" // len--
        "    b 1b                          \n" // WHILE true
        // End of size optimized forward implementation
#else

        // Performance optimized forward implementation:
        // Skip to 3 byte tail if we have to copy less than 4 bytes
        "    subs r2, r2, #4               \n" // len -= 4 // len offset: -4
        "    blt 6f                        \n" // IF len < 0: THEN GOTO [remainder3]
        // If the destination is misaligned, align it
        // This will return to label 3 if the source is aligned after the destination was aligned
        "    ands r12, r0, #3              \n" // misalignment = dest & 3
        "    bne 7f                        \n" // IF misalignment != 0: THEN GOTO [align]
        // The destination was already aligned, but the source isn't
        // We have no option but to fall back to byte by byte copying
        "    tst r1, #3                    \n" // TEST src & 3
        "    bne 8f                        \n" // IF src & 3: THEN GOTO [bytewise]
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "3:                                \n" // [aligned]: we are at least 4 byte aligned
        // Skip to 11 byte tail if we have to copy less than 12 bytes
        "    subs r2, r2, #8               \n" // len -= 8 // len offset: -12
        "    blt 5f                        \n" // IF len < 0: THEN GOTO [remainder11]
        // Skip to 31 byte tail if we have to copy less than 32 bytes
        "    subs r2, r2, #0x14            \n" // len -= 20 // len offset: -32
        "    blt 4f                        \n" // IF len < 0: THEN GOTO [remainder31]
        // Save R4 so that we have an additional copying scratchpad register
        "    str r4, [sp, #-4]!            \n" // SAVE R4 // STACK: R4 orig_dest return_addr
        "1:                                \n" // DO:
        // Copy 32 bytes at a time
        "    ldmia r1!, {r3, r4, r12, lr}  \n" // {R3,R4,R12,LR} = *src++ (qword)
        "    stmia r0!, {r3, r4, r12, lr}  \n" // *dest++ = {R3,R4,R12,LR} (qword)
        "    ldmia r1!, {r3, r4, r12, lr}  \n" // {R3,R4,R12,LR} = *src++ (qword)
        "    stmia r0!, {r3, r4, r12, lr}  \n" // *dest++ = {R3,R4,R12,LR} (qword)
        "    subs r2, r2, #0x20            \n" // len -= 32
        "    bge 1b                        \n" // WHILE len >= 0
        // Less than 32 bytes remaining, copy 16 if enough are remaining
        "    cmn r2, #0x10                 \n" // TEST len + 16
        "    ittt ge                       \n" // IF len >= -16:
        "    ldmiage r1!, {r3, r4, r12, lr} \n" // THEN {R3,R4,R12,LR} = *src++ (qword)
        "    stmiage r0!, {r3, r4, r12, lr} \n" // THEN *dest++ = {R3,R4,R12,LR} (qword)
        "    subge r2, r2, #0x10           \n" // THEN len -= 16
        // No need for R4 anymore, restore it so that later code doesn't have to take care of it
        "    ldr r4, [sp], #4              \n" // RESTORE R4 // STACK: orig_dest return_addr
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "4:                                \n" // [remainder31]: we have less than 32 bytes remaining
        // Correct length offset from 32 byte copying mode
        "    adds r2, r2, #0x14            \n" // len += 20 // len offset: -12
        "1:                                \n" // DO:
        // Copy 12 bytes at a time, while enough are remaining
        "    itttt ge                      \n" // IF len >= 0:
        "    ldmiage r1!, {r3, r12, lr}    \n" // THEN {R3,R12,LR} = *src++ (12 bytes)
        "    stmiage r0!, {r3, r12, lr}    \n" // THEN *dest++ = {R3,R12,LR} (12 bytes)
        "    subsge r2, r2, #0x0c          \n" // THEN len -= 12
        "    bge 1b                        \n" // WHILE len >= 12
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "5:                                \n" // [remainder11]: we have less than 12 bytes remaining
        // Correct length offset from 12 byte copying mode
        "    adds r2, r2, #8               \n" // len += 8 // len offset: -4
        // Skip to 3 byte tail if less than 4 bytes are remaining
        "    blt 6f                        \n" // IF len < 0: THEN GOTO [remainder3]
        // We will copy at least 4 bytes, adjust length
        "    subs r2, r2, #4               \n" // len -= 4
        // If less than 8 bytes are remaining, copy 4 bytes
        "    itt lt                        \n" // IF len < 0:
        "    ldrlt r3, [r1], #4            \n" // THEN R3 = *src++ (word)
        "    strlt r3, [r0], #4            \n" // THEN *dest++ = R3 (word)
        // If at least 8 bytes are remaining, copy 8 bytes
        "    ittt ge                       \n" // IF len >= 0:
        "    ldmiage r1!, {r3, r12}        \n" // THEN {R3,R12} = *src++ (dword)
        "    stmiage r0!, {r3, r12}        \n" // THEN *dest++ = {R3,R12} (dword)
        // We have subtracted 4 bytes above but copied 8, adjust length
        "    subge r2, r2, #4              \n" // THEN len -= 4
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "6:                                \n" // [remainder3]: we have less than 4 bytes remaining, copy them individually
        // Correct length offset from 4 byte copying mode
        "    adds r2, r2, #4               \n" // len += 4 // len offset: 0
        // If we're finished, return
        "    it eq                         \n" // IF len == 0:
        "    ldmfdeq sp!, {r0, pc}         \n" // THEN RETURN orig_dest
        "    cmp r2, #2                    \n" // TEST len - 2
        // We always have to copy at least one byte
        "    ldrb r3, [r1], #1             \n" // R3 = *src++ (byte)
        "    strb r3, [r0], #1             \n" // *dest++ = R3 (byte)
        // If we have to copy at least two, copy another one
        "    itt ge                        \n" // IF len >= 2:
        "    ldrbge r3, [r1], #1           \n" // THEN R3 = *src++ (byte)
        "    strbge r3, [r0], #1           \n" // THEN *dest++ = R3 (byte)
        // If we have to copy more than two (which is always 3), copy another one
        "    itt gt                        \n" // IF len > 2:
        "    ldrbgt r3, [r1], #1           \n" // THEN R3 = *src++ (byte)
        "    strbgt r3, [r0], #1           \n" // THEN *dest++ = R3 (byte)
        // Everything copied, return
        "    ldmfd sp!, {r0, pc}           \n" // RETURN orig_dest
        "                                  \n" //
        "7:                                \n" // [align]: the destination is misaligned, align it
        // Negate the misalignment to figure out how much we have to adjust
        "    rsb r12, r12, #4              \n" // misalignment = 4 - misalignment
        "    cmp r12, #2                   \n" // TEST misalignment - 2
        // We always have to copy at least one byte
        "    ldrb r3, [r1], #1             \n" // R3 = *src++ (byte)
        "    strb r3, [r0], #1             \n" // *dest++ = R3 (byte)
        // If we have to copy at least two, copy another one
        "    itt ge                        \n" // IF misalignment >= 2:
        "    ldrbge r3, [r1], #1           \n" // THEN R3 = *src++ (byte)
        "    strbge r3, [r0], #1           \n" // THEN *dest++ = R3 (byte)
        // If we have to copy more than two (which is always 3), copy another one
        "    itt gt                        \n" // IF misalignment > 2:
        "    ldrbgt r3, [r1], #1           \n" // THEN R3 = *src++ (byte)
        "    strbgt r3, [r0], #1           \n" // THEN *dest++ = R3 (byte)
        // The destination is aligned now, check if there are at least 4 bytes remaining
        "    subs r2, r2, r12              \n" // len -= misalignment
        // Skip to 3 byte tail if not
        "    blt 6b                        \n" // IF len < 0: THEN GOTO [remainder3]
        // If the source is now misaligned, we have to copy byte by byte
        "    tst r1, #3                    \n" // TEST src & 3
        // If not, resume fast copying method above
        "    beq 3b                        \n" // IF !(src & 3): GOTO [aligned]
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "8:                                \n" // [bytewise]: the destination is aligned, but the source isn't, copy byte by byte
        "                                  \n" // DO:
        // Stupidly copy a byte at a time
        "    ldrb r3, [r1], #1             \n" // R3 = *src++ (byte)
        "    strb r3, [r0], #1             \n" // *dest++ = R3 (byte)
        "    subs r2, r2, #1               \n" // len--
        "    bge 8b                        \n" // WHILE len >= 0
        // Less than 4 bytes remaining, use 3 byte tail copying code above
        "    b 6b                          \n" // GOTO [remainder3]
        // End of performance optimized forward implementation
#endif

        "9:                                \n" // [backward]: the destination is above the source, so we need to copy backwards
        // Jump to end of src and dest, and copy backwards
        "    add r1, r1, r2                \n" // src += len
        "    add r0, r0, r2                \n" // dest += len
        "                                  \n" // (fallthrough)

#if defined(SQUEEZE)
        // Absolute minimum size backward implementation:
        "1:                                \n" // DO:
        // Stupidly copy a byte at a time
        "    subs r2, r2, #1               \n" // len--
        "    it lt                         \n" // IF len < 0:
        "    ldmfdlt sp!, {r0, pc}         \n" // THEN RETURN orig_dest
        "    ldrb r3, [r1, #-1]!           \n" // R3 = *--src (byte)
        "    strb r3, [r0, #-1]!           \n" // *--dest = R3 (byte)
        "    b 1b                          \n" // WHILE true
        // End of absolute minimum size backward implementation

#elif defined(OPTIMIZE_SIZE)
        // Size optimized backward implementation:
        // Skip to tail if we have to copy less than 4 bytes
        "    subs r2, r2, #4               \n" // len -= 4 // len offset: -4
        "    blt 2f                        \n" // IF len < 0: THEN GOTO [remainder]
        // Skip to tail if src or dest are misaligned
        "    tst r0, #3                    \n" // TEST dest & 3
        "    it eq                         \n" // IF !(dest & 3):
        "    tsteq r1, #3                  \n" // THEN TEST src & 3
        "    bne 2f                        \n" // IF (dest & 3) OR (src & 3): THEN GOTO [remainder]
        // Copy 4 bytes at a time until less than 4 are remaining
        "1:                                \n" // DO:
        "    ldr r3, [r1, #-4]!            \n" // R3 = *--src (word)
        "    str r3, [r0, #-4]!            \n" // *--dest = R3 (word)
        "    subs r2, r2, #4               \n" // len -= 4
        "    bge 1b                        \n" // WHILE len >= 0
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "2:                                \n" // [remainder]: copy the remainder byte by byte
        // Correct length offset from 4 byte copying mode
        "    adds r2, r2, #4               \n" // len += 4 // len offset: 0
        "1:                                \n" // DO:
        // Copy the remainder a byte at a time
        "    it eq                         \n" // IF len == 0:
        "    ldmfdeq sp!, {r0, pc}         \n" // THEN RETURN orig_dest
        "    ldrb r3, [r1, #-1]!           \n" // R3 = *--src (byte)
        "    strb r3, [r0, #-1]!           \n" // *--dest = R3 (byte)
        "    subs r2, r2, #1               \n" // len--
        "    b 1b                          \n" // WHILE true
        // End of size optimized backward implementation

#else
        // Performance optimized backward implementation:
        // Skip to 3 byte tail if we have to copy less than 4 bytes
        "    subs r2, r2, #4               \n" // len -= 4 // len offset: -4
        "    blt 6f                        \n" // IF len < 0: THEN GOTO [remainder3]
        // If the destination is misaligned, align it
        // This will return to label 3 if the source is aligned after the destination was aligned
        "    ands r12, r0, #3              \n" // misalignment = dest & 3
        "    bne 7f                        \n" // IF misalignment != 0: THEN GOTO [align]
        // The destination was already aligned, but the source isn't
        // We have no option but to fall back to byte by byte copying
        "    tst r1, #3                    \n" // TEST src & 3
        "    bne 8f                        \n" // IF src & 3: THEN GOTO [bytewise]
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "3:                                \n" // [aligned]: we are at least 4 byte aligned
        // Skip to 11 byte tail if we have to copy less than 12 bytes
        "    subs r2, r2, #8               \n" // len -= 8 // len offset: -12
        "    blt 5f                        \n" // IF len < 0: THEN GOTO [remainder11]
        // Skip to 31 byte tail if we have to copy less than 32 bytes
        "    subs r2, r2, #0x14            \n" // len -= 20 // len offset: -32
        "    blt 4f                        \n" // IF len < 0: THEN GOTO [remainder31]
        // Save R4 so that we have an additional copying scratchpad register
        "    str r4, [sp, #-4]!            \n" // SAVE R4 // STACK: R4 orig_dest return_addr
        "1:                                \n" // DO:
        // Copy 32 bytes at a time
        "    ldmdb r1!, {r3, r4, r12, lr}  \n" // {R3,R4,R12,LR} = *--src (qword)
        "    stmdb r0!, {r3, r4, r12, lr}  \n" // *--dest = {R3,R4,R12,LR} (qword)
        "    ldmdb r1!, {r3, r4, r12, lr}  \n" // {R3,R4,R12,LR} = *--src (qword)
        "    stmdb r0!, {r3, r4, r12, lr}  \n" // *--dest = {R3,R4,R12,LR} (qword)
        "    subs r2, r2, #0x20            \n" // len -= 32
        "    bge 1b                        \n" // WHILE len >= 0
        // Less than 32 bytes remaining, copy 16 if enough are remaining
        "    cmn r2, #0x10                 \n" // TEST len + 16
        "    ittt ge                       \n" // IF len >= -16:
        "    ldmdbge r1!, {r3, r4, r12, lr} \n" // THEN {R3,R4,R12,LR} = *--src (qword)
        "    stmdbge r0!, {r3, r4, r12, lr} \n" // THEN *--dest = {R3,R4,R12,LR} (qword)
        "    subge r2, r2, #0x10           \n" // THEN len -= 16
        // No need for R4 anymore, restore it so that later code doesn't have to take care of it
        "    ldr r4, [sp], #4              \n" // RESTORE R4 // STACK: orig_dest return_addr
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "4:                                \n" // [remainder31]: we have less than 32 bytes remaining
        // Correct length offset from 32 byte copying mode
        "    adds r2, r2, #0x14            \n" // len += 20 // len offset: -12
        "1:                                \n" // DO:
        // Copy 12 bytes at a time, while enough are remaining
        "    itttt ge                      \n" // IF len >= 0:
        "    ldmdbge r1!, {r3, r12, lr}    \n" // THEN {R3,R12,LR} = *--src (12 bytes)
        "    stmdbge r0!, {r3, r12, lr}    \n" // THEN *--dest = {R3,R12,LR} (12 bytes)
        "    subsge r2, r2, #0x0c          \n" // THEN len -= 12
        "    bge 1b                        \n" // WHILE len >= 12
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "5:                                \n" // [remainder11]: we have less than 12 bytes remaining
        // Correct length offset from 12 byte copying mode
        "    adds r2, r2, #8               \n" // len += 8 // len offset: -4
        // Skip to 3 byte tail if less than 4 bytes are remaining
        "    blt 6f                        \n" // IF len < 0: THEN GOTO [remainder3]
        // We will copy at least 4 bytes, adjust length
        "    subs r2, r2, #4               \n" // len -= 4
        // If less than 8 bytes are remaining, copy 4 bytes
        "    itt lt                        \n" // IF len < 0:
        "    ldrlt r3, [r1, #-4]!          \n" // THEN R3 = *--src (word)
        "    strlt r3, [r0, #-4]!          \n" // THEN *--dest = R3 (word)
        // If at least 8 bytes are remaining, copy 8 bytes
        "    ittt ge                       \n" // IF len >= 0:
        "    ldmdbge r1!, {r3, r12}        \n" // THEN {R3,R12} = *--src (dword)
        "    stmdbge r0!, {r3, r12}        \n" // THEN *--dest = {R3,R12} (dword)
        // We have subtracted 4 bytes above but copied 8, adjust length
        "    subge r2, r2, #4              \n" // THEN len -= 4
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "6:                                \n" // [remainder3]: we have less than 4 bytes remaining, copy them individually
        // Correct length offset from 4 byte copying mode
        "    adds r2, r2, #4               \n" // len += 4 // len offset: 0
        // If we're finished, return
        "    it eq                         \n" // IF len == 0:
        "    ldmfdeq sp!, {r0, pc}         \n" // THEN RETURN orig_dest
        "    cmp r2, #2                    \n" // TEST len - 2
        // We always have to copy at least one byte
        "    ldrb r3, [r1, #-1]!           \n" // R3 = *--src (byte)
        "    strb r3, [r0, #-1]!           \n" // *--dest = R3 (byte)
        // If we have to copy at least two, copy another one
        "    itt ge                        \n" // IF len >= 2:
        "    ldrbge r3, [r1, #-1]!         \n" // THEN R3 = *--src (byte)
        "    strbge r3, [r0, #-1]!         \n" // THEN *--dest = R3 (byte)
        // If we have to copy more than two (which is always 3), copy another one
        "    itt gt                        \n" // IF len > 2:
        "    ldrbgt r3, [r1, #-1]!         \n" // THEN R3 = *--src (byte)
        "    strbgt r3, [r0, #-1]!         \n" // THEN *--dest = R3 (byte)
        // Everything copied, return
        "    ldmfd sp!, {r0, pc}           \n" // RETURN orig_dest
        "                                  \n" //
        "7:                                \n" // [align]: the destination is misaligned, align it
        // No need to negate the misalignment here, we are going backwards so we have to adjust by the misaligned amount
        "    cmp r12, #2                   \n" // TEST misalignment - 2
        // We always have to copy at least one byte
        "    ldrb r3, [r1, #-1]!           \n" // R3 = *--src (byte)
        "    strb r3, [r0, #-1]!           \n" // *--dest = R3 (byte)
        // If we have to copy at least two, copy another one
        "    itt ge                        \n" // IF misalignment >= 2:
        "    ldrbge r3, [r1, #-1]!         \n" // THEN R3 = *--src (byte)
        "    strbge r3, [r0, #-1]!         \n" // THEN *--dest = R3 (byte)
        // If we have to copy more than two (which is always 3), copy another one
        "    itt gt                        \n" // IF misalignment > 2:
        "    ldrbgt r3, [r1, #-1]!         \n" // THEN R3 = *--src (byte)
        "    strbgt r3, [r0, #-1]!         \n" // THEN *--dest = R3 (byte)
        // The destination is aligned now, check if there are at least 4 bytes remaining
        "    subs r2, r2, r12              \n" // len -= misalignment
        // Skip to 3 byte tail if not
        "    blt 6b                        \n" // IF len < 0: THEN GOTO [remainder3]
        // If the source is now misaligned, we have to copy byte by byte
        "    tst r1, #3                    \n" // TEST src & 3
        // If not, resume fast copying method above
        "    beq 3b                        \n" // IF !(src & 3): GOTO [aligned]
        "                                  \n" // (fallthrough)
        "                                  \n" //
        "8:                                \n" // [bytewise]: the destination is aligned, but the source isn't, copy byte by byte
        "                                  \n" // DO:
        // Stupidly copy a byte at a time
        "    ldrb r3, [r1, #-1]!           \n" // R3 = *--src (byte)
        "    strb r3, [r0, #-1]!           \n" // *--dest = R3 (byte)
        "    subs r2, r2, #1               \n" // len--
        "    bge 8b                        \n" // WHILE len >= 0
        // Less than 4 bytes remaining, use 3 byte tail copying code above
        "    b 6b                          \n" // GOTO [remainder3]
        // End of performance optimized backward implementation
#endif
    );
}
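
// For reference, a minimal C sketch of the direction selection implemented
// above (illustration, not part of the original file):
//
//     void* memmove_ref(void* dst, const void* src, size_t len)
//     {
//         unsigned char* d = (unsigned char*)dst;
//         const unsigned char* s = (const unsigned char*)src;
//         if (s < d && d < s + len)                 // destructive overlap:
//             while (len--) d[len] = s[len];        //   copy backwards
//         else if (s != d)                          // otherwise:
//             while (len--) *d++ = *s++;            //   copy forwards
//         return dst;
//     }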

// memcpy shares the memmove implementation: since memmove handles arbitrary
// overlap, it is also a valid (if slightly larger) memcpy.
__attribute__((alias("memmove"))) void* memcpy(void* dst, const void* src, size_t len);
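
// Usage note (illustration, not part of the original file): thanks to the
// alias, overlapping copies through memcpy are well-defined here, unlike with
// a standard memcpy:
//
//     char buf[8] = "abcdef";
//     memcpy(buf + 1, buf, 5);        // behaves like memmove: buf becomes "aabcdef"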

#ifdef CPU_ARM_CORTEX_M
__attribute__((const)) uint32_t swap32(uint32_t data)
{
    __asm__("rev %[data], %[data]" : [data] "+r" (data));   // reverse the byte order of the word
    return data;
}

__attribute__((const)) uint32_t swap16(uint32_t data)
{
    __asm__("rev16 %[data], %[data]" : [data] "+r" (data)); // reverse the byte order within each halfword
    return data;
}

__attribute__((const)) uint32_t reverse32(uint32_t data)
{
    __asm__("rbit %[data], %[data]" : [data] "+r" (data));  // reverse the bit order of the word
    return data;
}
#endif
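
// Example values (illustration, not part of the original file):
//
//     swap32(0x12345678)    == 0x78563412   // whole-word byte swap
//     swap16(0x12345678)    == 0x34127856   // byte swap within each halfword
//     reverse32(0x00000001) == 0x80000000   // bit reversal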