Subversion Repositories freemyipod

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
881 theseven 1
#include "global.h"
2
#include "sys/util.h"
3
 
4
// Disable IRQ and FIQ interrupts on the current core.
// NOTE: no nesting counter or saved state is kept here; a matching
// leave_critical_section() unconditionally re-enables interrupts.
void enter_critical_section()
{
#ifdef CPU_ARM_CORTEX
    // Cortex cores: single instruction masks both IRQ and FIQ
    __asm__ volatile("cpsid if");
#else
    // Classic ARM: set the I (0x80) and F (0x40) mask bits in the CPSR
    __asm__ volatile(
        "mrs r0, cpsr       \n\t"
        "orr r0, r0, #0xc0  \n\t"
        "msr cpsr_c, r0     \n\t"
    ::: "r0");
#endif
}
16
 
17
// Re-enable IRQ and FIQ interrupts on the current core.
// NOTE: unconditional — this does not restore a previously saved mask,
// so nested critical sections are not supported.
void leave_critical_section()
{
#ifdef CPU_ARM_CORTEX
    // Cortex cores: single instruction unmasks both IRQ and FIQ
    __asm__ volatile("cpsie if");
#else
    // Classic ARM: clear the I (0x80) and F (0x40) mask bits in the CPSR
    __asm__ volatile(
        "mrs r0, cpsr       \n\t"
        "bic r0, r0, #0xc0  \n\t"
        "msr cpsr_c, r0     \n\t"
    ::: "r0");
#endif
}
29
 
30
/*
 * Clean (write back) the data cache via CP15.
 * The addr/len arguments are accepted for API symmetry but currently
 * ignored: every implementation below cleans the ENTIRE cache.
 * No-op on non-classic (Cortex) cores.
 */
void clean_dcache(const void* addr, uint32_t len)
{
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mov r0, #0               \n\t"
#if defined(CPU_ARM_OLD_V6)
        // ARMv6: clean entire D-cache with a single operation
        "mcr p15, 0, r0,c7,c10,0  \n\t"
#elif defined(CPU_ARM_OLD_V5)
        // ARMv5: "test and clean" loop until the whole cache is clean
        "0:                       \n\t"
        "mrc p15, 0, r15,c7,c10,3 \n\t"
        "bne 0b                   \n\t"
#elif defined(CPU_ARM_OLD_V4)
        // ARMv4: clean by index, stepping through all sets/ways
        "0:                       \n\t"
        "mcr p15, 0, r0,c7,c10,2  \n\t"
        "add r1, r0, #0x10        \n\t"
        "mcr p15, 0, r1,c7,c10,2  \n\t"
        "add r1, r1, #0x10        \n\t"
        "mcr p15, 0, r1,c7,c10,2  \n\t"
        "add r1, r1, #0x10        \n\t"
        "mcr p15, 0, r1,c7,c10,2  \n\t"
        "adds r0, r0, #0x04000000 \n\t"
        "bne 0b                   \n\t"
#else
// Fixed copy-paste: this message previously named invalidate_dcache.
#error Please implement clean_dcache for this ARM core generation!
#endif
        // Drain the write buffer so cleaned data reaches memory
        "mcr p15, 0, r0,c7,c10,4 \n\t"
    ::: "r0", "r1");
#endif
}
59
 
60
/*
 * Invalidate the data cache via CP15.
 * The addr/len arguments are accepted for API symmetry but currently
 * ignored: every implementation below operates on the ENTIRE cache.
 * NOTE(review): the operations used (c7,c14) are "clean AND invalidate",
 * so dirty lines are written back before being discarded — presumably
 * intentional to avoid losing unrelated dirty data; confirm.
 * No-op on non-classic (Cortex) cores.
 */
void invalidate_dcache(const void* addr, uint32_t len)
{
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mov r0, #0               \n\t"
#if defined(CPU_ARM_OLD_V6)
        // ARMv6: clean+invalidate entire D-cache with a single operation
        "mcr p15, 0, r0,c7,c14,0  \n\t"
#elif defined(CPU_ARM_OLD_V5)
        // ARMv5: "test, clean and invalidate" loop until done
        "0:                       \n\t"
        "mrc p15, 0, r15,c7,c14,3 \n\t"
        "bne 0b                   \n\t"
#elif defined(CPU_ARM_OLD_V4)
        // ARMv4: clean+invalidate by index, stepping through all sets/ways
        "0:                       \n\t"
        "mcr p15, 0, r0,c7,c14,2  \n\t"
        "add r1, r0, #0x10        \n\t"
        "mcr p15, 0, r1,c7,c14,2  \n\t"
        "add r1, r1, #0x10        \n\t"
        "mcr p15, 0, r1,c7,c14,2  \n\t"
        "add r1, r1, #0x10        \n\t"
        "mcr p15, 0, r1,c7,c14,2  \n\t"
        "adds r0, r0, #0x04000000 \n\t"
        "bne 0b                   \n\t"
#else
#error Please implement invalidate_dcache for this ARM core generation!
#endif
        // Drain the write buffer
        "mcr p15, 0, r0,c7,c10,4 \n\t"
    ::: "r0", "r1");
#endif
}
89
 
90
/*
 * Invalidate the instruction cache via CP15.
 * The addr/len arguments are accepted for API symmetry but currently
 * ignored: the whole I-cache is invalidated.
 * No-op on non-classic (Cortex) cores.
 */
void invalidate_icache(const void* addr, uint32_t len)
{
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mov r0, #0              \n\t"
        // Invalidate the entire instruction cache
        "mcr p15, 0, r0,c7,c5,0  \n\t"
#ifdef CPU_ARM_OLD_V6
        // ARMv6: additionally flush the prefetch buffer
        "mcr p15, 0, r0,c7,c5,4  \n\t"
#endif
    ::: "r0");
#endif
}
102
 
103
/*
 * Enable the MMU together with the instruction and data caches
 * (CP15 control register bits 12, 2 and 0).
 * No-op on non-classic (Cortex) cores.
 *
 * BUGFIX: the original code read the control register into r0 but then
 * OR'ed the bits into the uninitialized r1 (whose value was discarded),
 * writing the UNMODIFIED r0 back — making this function a no-op.
 * All operations now consistently use r0.
 */
void enable_mmu()
{
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mrc p15, 0, r0,c1,c0    \n\t"  // r0 = CP15 control register
        "orr r0, r0, #0x00001000 \n\t"  // set bit 12 (I-cache enable)
        "orr r0, r0, #0x00000005 \n\t"  // set bits 2 and 0 (D-cache + MMU enable)
        "mcr p15, 0, r0,c1,c0    \n\t"  // write the control register back
    ::: "r0");
#endif
}
114
 
115
/*
 * Disable the MMU and the instruction and data caches
 * (CP15 control register bits 12, 2 and 0).
 * No-op on non-classic (Cortex) cores.
 *
 * BUGFIX: the original code read the control register into r0 but then
 * cleared the bits in the uninitialized r1 (whose value was discarded),
 * writing the UNMODIFIED r0 back — making this function a no-op.
 * All operations now consistently use r0.
 */
void disable_mmu()
{
#ifdef CPU_ARM_OLD
    __asm__ volatile(
        "mrc p15, 0, r0,c1,c0    \n\t"  // r0 = CP15 control register
        "bic r0, r0, #0x00001000 \n\t"  // clear bit 12 (I-cache enable)
        "bic r0, r0, #0x00000005 \n\t"  // clear bits 2 and 0 (D-cache + MMU enable)
        "mcr p15, 0, r0,c1,c0    \n\t"  // write the control register back
    ::: "r0");
#endif
}
126
 
127
/*
 * memset: fill len (R2) bytes at dst_void (R0) with the low byte of
 * val (R1); returns dst_void. Implemented as naked ARM assembly — the
 * destination is copied to R3 so R0 survives untouched as the return
 * value. Building with SQUEEZE drops the fast word-store path.
 */
__attribute__((naked, noinline)) void* memset(void* dst_void, int val, size_t len)
{
    __asm__ volatile(
        ".syntax unified             \n"
        "    mov r3, r0              \n"  // work on a copy; R0 is the return value
#ifndef SQUEEZE
        // Fast path (>= 8 bytes): replicate the fill byte across a word
        // and store 8 bytes at a time with STM.
        "    cmp r2, #8              \n"
        "    blt 2f                  \n"
        "    orr r1, r1, r1, lsl #8  \n"  // replicate byte into both halfword bytes
        "    orr r1, r1, r1, lsl #16 \n"  // ... and across the full word
        // Byte-store until the destination is word aligned
        "1:                          \n"
        "    tst r3, #3              \n"
        "    itt ne                  \n"
        "    strbne r1, [r3], #1     \n"
        "    subne r2, r2, #1        \n"
        "    bne 1b                  \n"
        "    mov r12, r1             \n"  // second fill register for the STM pair
        // Unrolled loop: up to four 8-byte STM stores per pass
        "1:                          \n"
        "    cmp r2, #8              \n"
        "    blt 2f                  \n"
        "    stmia r3!, {r1, r12}    \n"
        "    sub r2, r2, #8          \n"
        "    cmp r2, #8              \n"
        "    blt 2f                  \n"
        "    stmia r3!, {r1, r12}    \n"
        "    sub r2, r2, #8          \n"
        "    cmp r2, #8              \n"
        "    blt 2f                  \n"
        "    stmia r3!, {r1, r12}    \n"
        "    sub r2, r2, #8          \n"
        "    cmp r2, #8              \n"
        "    itt ge                  \n"
        "    stmiage r3!, {r1, r12}  \n"
        "    subge r2, r2, #8        \n"
        "    bge 1b                  \n"
        "2:                          \n"
#endif
        // Tail (the whole loop when SQUEEZE): store remaining bytes one by one
        "    movs r2, r2             \n"
        "    it eq                   \n"
        "    bxeq lr                 \n"  // nothing (left) to do: return original dest
        "1:                          \n"
        "    strb r1, [r3], #1       \n"
        "    subs r2, r2, #1         \n"
        "    bne 1b                  \n"
        "    bx lr                   \n"
    );
}
174
 
175
/*
 * memmove: copy len (R2) bytes from src (R1) to dst (R0), correctly
 * handling overlapping regions by choosing a forward or backward copy;
 * returns dst. Implemented as naked ARM assembly. Three build variants:
 * SQUEEZE (minimum size, bytewise), OPTIMIZE_SIZE (small, wordwise),
 * and the default performance-optimized version (aligning + 32-byte
 * block copies). memcpy is aliased to this function.
 */
__attribute__((naked, noinline)) void* memmove(void* dst, const void* src, size_t len)
{
    __asm__ volatile(
        ".syntax unified                     \n"
        // Copy memory forward or backward, allows overlapping src/dest
        // args: R0: dest, R1: src, R2: len; returns: dest
        "    cmp r1, r0                      \n"  // TEST src - dst
        // If src == dst we don't need to do anything at all
        "    it eq                           \n"  // IF src == dest:
        "    bxeq lr                         \n"  //     THEN return dest
        "    stmfd sp!, {r0, lr}             \n"  // SAVE return_addr orig_dest                         // STACK: orig_dest return_addr
        // Check if we need to go backwards (if source < destination and there is overlap)
        "    itt cc                          \n"  // IF src < dest:
        "    addcc r3, r1, r2                \n"  //     THEN src_end = src + len
        "    cmpcc r0, r3                    \n"  //     THEN TEST dest - src_end
        "    bcc 9f                          \n"  // IF (src < dest) AND (dest < src_end): THEN GOTO [backward]
        "                                    \n"  // (fallthrough)

#if defined(SQUEEZE)
        // Absolute minimum size forward implementation:
        "1:                                  \n"  // DO:
        // Stupidly copy a byte at a time
        "    subs r2, r2, #1                 \n"  //     len--
        "    it lt                           \n"  //     IF len < 0:
        "    ldmfdlt sp!, {r0, pc}           \n"  //         THEN RETURN orig_dest
        "    ldrb r3, [r1], #1               \n"  //     R3 = *src++ (byte)
        "    strb r3, [r0], #1               \n"  //     *dest++ = R3 (byte)
        "    b 1b                            \n"  // WHILE true
        // End of absolute minimum size forward implementation

#elif defined(OPTIMIZE_SIZE)
        // Size optimized forward implementation:
        // Skip to tail if we have to copy less than 4 bytes
        "    subs r2, r2, #4                 \n"  // len -= 4                                           // len offset: -4
        "    blt 2f                          \n"  // IF len < 0: THEN GOTO [remainder]
        // Skip to tail if src or dest are misaligned
        "    tst r0, #3                      \n"  // TEST dest & 3
        "    it eq                           \n"  // IF !(dest & 3):
        "    tsteq r1, #3                    \n"  //     THEN TEST src & 3
        "    bne 2f                          \n"  // IF (dest & 3) OR (src & 3): THEN GOTO [remainder]
        // Copy 4 bytes at a time until less than 4 are remaining
        "1:                                  \n"  // DO:
        "    ldr r3, [r1], #4                \n"  //     R3 = *src++ (word)
        "    str r3, [r0], #4                \n"  //     *dest++ = R3 (word)
        "    subs r2, r2, #4                 \n"  //     len -= 4
        "    bge 1b                          \n"  // WHILE: len >= 0
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "2:                                  \n"  // [remainder]: copy the remainder byte by byte
        // Correct length offset from 4 byte copying mode
        "    adds r2, r2, #4                 \n"  // len += 4                                           // len offset: 0
        "1:                                  \n"  // DO:
        // Copy the remainder a byte at a time
        "    it eq                           \n"  //     IF len == 0:
        "    ldmfdeq sp!, {r0, pc}           \n"  //         THEN RETURN orig_dest
        "    ldrb r3, [r1], #1               \n"  //     R3 = *src++ (byte)
        "    strb r3, [r0], #1               \n"  //     *dest++ = R3 (byte)
        "    subs r2, r2, #1                 \n"  //     len--
        "    b 1b                            \n"  // WHILE true
        // End of size optimized forward implementation
#else

        // Performance optimized forward implementation:
        // Skip to 3 byte tail if we have to copy less than 4 bytes
        "    subs r2, r2, #4                 \n"  // len -= 4                                           // len offset: -4
        "    blt 6f                          \n"  // IF len < 0: THEN GOTO [remainder3]
        // If the destination is misaligned, align it
        // This will return to label 3 if the source is aligned after the destination was aligned
        "    ands r12, r0, #3                \n"  // misalignment = dest & 3
        "    bne 7f                          \n"  // IF misalignment != 0: THEN GOTO [align]
        // The destination was already aligned, but the source isn't
        // We have no option but to fall back to byte by byte copying
        "    tst r1, #3                      \n"  // TEST src & 3
        "    bne 8f                          \n"  // IF src & 3: THEN GOTO [bytewise]
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "3:                                  \n"  // [aligned]: we are at least 4 byte aligned
        // Skip to 11 byte tail if we have to copy less than 12 bytes
        "    subs r2, r2, #8                 \n"  // len -= 8                                           // len offset: -12
        "    blt 5f                          \n"  // IF len < 0: THEN GOTO [remainder11]
        // Skip to 31 byte tail if we have to copy less than 32 bytes
        "    subs r2, r2, #0x14              \n"  // len -= 20                                          // len offset: -32
        "    blt 4f                          \n"  // IF len < 0: THEN GOTO [remainder31]
        // Save R4 so that we have an additional copying scratchpad register
        "    str r4, [sp, #-4]!              \n"  // SAVE R4                                            // STACK: R4 orig_dest return_addr
        "1:                                  \n"  // DO:
        // Copy 32 bytes at a time
        "    ldmia r1!, {r3, r4, r12, lr}    \n"  //     {R3,R4,R12,LR} = *src++ (qword)
        "    stmia r0!, {r3, r4, r12, lr}    \n"  //     *dest++ = {R3,R4,R12,LR} (qword)
        "    ldmia r1!, {r3, r4, r12, lr}    \n"  //     {R3,R4,R12,LR} = *src++ (qword)
        "    stmia r0!, {r3, r4, r12, lr}    \n"  //     *dest++ = {R3,R4,R12,LR} (qword)
        "    subs r2, r2, #0x20              \n"  //     len -= 32
        "    bge 1b                          \n"  // WHILE len >= 0
        // Less than 32 bytes remaining, copy 16 if enough are remaining
        "    cmn r2, #0x10                   \n"  // TEST len + 16
        "    ittt ge                         \n"  // IF len >= -16:
        "    ldmiage r1!, {r3, r4, r12, lr}  \n"  //     THEN {R3,R4,R12,LR} = *src++ (qword)
        "    stmiage r0!, {r3, r4, r12, lr}  \n"  //     THEN *dest++ = {R3,R4,R12,LR} (qword)
        "    subge r2, r2, #0x10             \n"  //     THEN len -= 16
        // No need for R4 anymore, restore it so that later code doesn't have to take care of it
        "    ldr r4, [sp], #4                \n"  // RESTORE R4                                         // STACK: orig_dest return_addr
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "4:                                  \n"  // [remainder31]: we have less than 32 bytes remaining
        // Correct length offset from 32 byte copying mode
        "    adds r2, r2, #0x14              \n"  // len += 20                                          // len offset: -12
        "1:                                  \n"  // DO:
        // Copy 12 bytes at a time, while enough are remaining
        "    itttt ge                        \n"  //     IF len >= 0:
        "    ldmiage r1!, {r3, r12, lr}      \n"  //         THEN {R3,R12,LR} = *src++ (12 bytes)
        "    stmiage r0!, {r3, r12, lr}      \n"  //         THEN *dest++ = {R3,R12,LR} (12 bytes)
        "    subsge r2, r2, #0x0c            \n"  //         THEN len -= 12
        "    bge 1b                          \n"  // WHILE len >= 12
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "5:                                  \n"  // [remainder11]: we have less than 12 bytes remaining
        // Correct length offset from 12 byte copying mode
        "    adds r2, r2, #8                 \n"  // len += 8                                           // len offset: -4
        // Skip to 3 byte tail if less than 4 bytes are remaining
        "    blt 6f                          \n"  // IF len < 0: THEN GOTO [remainder3]
        // We will copy at least 4 bytes, adjust length
        "    subs r2, r2, #4                 \n"  // len -= 4
        // If less than 8 bytes are remaining, copy 4 bytes
        "    itt lt                          \n"  // IF len < 0:
        "    ldrlt r3, [r1], #4              \n"  //     THEN R3 = *src++ (word)
        "    strlt r3, [r0], #4              \n"  //     THEN *dest++ = R3 (word)
        // If at least 8 bytes are remaining, copy 8 bytes
        "    ittt ge                         \n"  // IF len >= 0:
        "    ldmiage r1!, {r3, r12}          \n"  //     THEN {R3,R12} = *src++ (dword)
        "    stmiage r0!, {r3, r12}          \n"  //     THEN *dest++ = {R3,R12} (dword)
        // We have subtracted 4 bytes above but copied 8, adjust length
        "    subge r2, r2, #4                \n"  //     THEN len -= 4
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "6:                                  \n"  // [remainder3]: we have less than 4 bytes remaining, copy them individually
        // Correct length offset from 4 byte copying mode
        "    adds r2, r2, #4                 \n"  // len += 4                                           // len offset: 0
        // If we're finished, return
        "    it eq                           \n"  // IF len == 0:
        "    ldmfdeq sp!, {r0, pc}           \n"  //     THEN RETURN orig_dest
        "    cmp r2, #2                      \n"  // TEST len - 2
        // We always have to copy at least one byte
        "    ldrb r3, [r1], #1               \n"  // R3 = *src++ (byte)
        "    strb r3, [r0], #1               \n"  // *dest++ = R3 (byte)
        // If we have to copy at least two, copy another one
        "    itt ge                          \n"  // IF len >= 2:
        "    ldrbge r3, [r1], #1             \n"  //     THEN R3 = *src++ (byte)
        "    strbge r3, [r0], #1             \n"  //     THEN *dest++ = R3 (byte)
        // If we have to copy more than two (which is always 3), copy another one
        "    itt gt                          \n"  // IF len > 2:
        "    ldrbgt r3, [r1], #1             \n"  //     THEN R3 = *src++ (byte)
        "    strbgt r3, [r0], #1             \n"  //     THEN *dest++ = R3 (byte)
        // Everything copied, return
        "    ldmfd sp!, {r0, pc}             \n"  // RETURN orig_dest
        "                                    \n"  //
        "7:                                  \n"  // [align]: the destination is misaligned, align it
        // Negate the misalignment to figure out how much we have to adjust
        "    rsb r12, r12, #4                \n"  // misalignment = 4 - misalignment
        "    cmp r12, #2                     \n"  // TEST misalignment - 2
        // We always have to copy at least one byte
        "    ldrb r3, [r1], #1               \n"  // R3 = *src++ (byte)
        "    strb r3, [r0], #1               \n"  // *dest++ = R3 (byte)
        // If we have to copy at least two, copy another one
        "    itt ge                          \n"  // IF misalignment >= 2:
        "    ldrbge r3, [r1], #1             \n"  //     THEN R3 = *src++ (byte)
        "    strbge r3, [r0], #1             \n"  //     THEN *dest++ = R3 (byte)
        // If we have to copy more than two (which is always 3), copy another one
        "    itt gt                          \n"  // IF misalignment > 2:
        "    ldrbgt r3, [r1], #1             \n"  //     THEN R3 = *src++ (byte)
        "    strbgt r3, [r0], #1             \n"  //     THEN *dest++ = R3 (byte)
        // The destination is aligned now, check if there are at least 4 bytes remaining
        "    subs r2, r2, r12                \n"  // len -= misalignment
        // Skip to 3 byte tail if not
        "    blt 6b                          \n"  // IF len < 0: THEN GOTO [remainder3]
        // If the source is now misaligned, we have to copy byte by byte
        "    tst r1, #3                      \n"  // TEST src & 3
        // If not, resume fast copying method above
        "    beq 3b                          \n"  // IF !(src & 3): GOTO [aligned]
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "8:                                  \n"  // [bytewise]: the destination is aligned, but the source isn't, copy byte by byte
        "                                    \n"  // DO:
        // Stupidly copy a byte at a time
        "    ldrb r3, [r1], #1               \n"  //     R3 = *src++ (byte)
        "    strb r3, [r0], #1               \n"  //     *dest++ = R3 (byte)
        "    subs r2, r2, #1                 \n"  //     len--
        "    bge 8b                          \n"  // WHILE len >= 0
        // Less than 4 bytes remaining, use 3 byte tail copying code above
        "    b 6b                            \n"  // GOTO [remainder3]
        // End of performance optimized forward implementation
#endif

        "9:                                  \n"  // [backward]: the destination is above the source, so we need to copy backwards
        // Jump to end of src and dest, and copy backwards
        "    add r1, r1, r2                  \n"  // src += len
        "    add r0, r0, r2                  \n"  // dest += len
        "                                    \n"  // (fallthrough)

#if defined(SQUEEZE)
        // Absolute minimum size backward implementation:
        "1:                                  \n"  // DO:
        // Stupidly copy a byte at a time
        "    subs r2, r2, #1                 \n"  //     len--
        "    it lt                           \n"  //     IF len < 0:
        "    ldmfdlt sp!, {r0, pc}           \n"  //         THEN RETURN orig_dest
        "    ldrb r3, [r1, #-1]!             \n"  //     R3 = *--src (byte)
        "    strb r3, [r0, #-1]!             \n"  //     *--dest = R3 (byte)
        "    b 1b                            \n"  // WHILE true
        // End of absolute minimum size backward implementation

#elif defined(OPTIMIZE_SIZE)
        // Size optimized backward implementation:
        // Skip to tail if we have to copy less than 4 bytes
        "    subs r2, r2, #4                 \n"  // len -= 4                                           // len offset: -4
        "    blt 2f                          \n"  // IF len < 0: THEN GOTO [remainder]
        // Skip to tail if src or dest are misaligned
        "    tst r0, #3                      \n"  // TEST dest & 3
        "    it eq                           \n"  // IF !(dest & 3):
        "    tsteq r1, #3                    \n"  //     THEN TEST src & 3
        "    bne 2f                          \n"  // IF (dest & 3) OR (src & 3): THEN GOTO [remainder]
        // Copy 4 bytes at a time until less than 4 are remaining
        "1:                                  \n"  // DO:
        "    ldr r3, [r1, #-4]!              \n"  //     R3 = *--src (word)
        "    str r3, [r0, #-4]!              \n"  //     *--dest = R3 (word)
        "    subs r2, r2, #4                 \n"  //     len -= 4
        "    bge 1b                          \n"  // WHILE: len >= 0
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "2:                                  \n"  // [remainder]: copy the remainder byte by byte
        // Correct length offset from 4 byte copying mode
        "    adds r2, r2, #4                 \n"  // len += 4                                           // len offset: 0
        "1:                                  \n"  // DO:
        // Copy the remainder a byte at a time
        "    it eq                           \n"  //     IF len == 0:
        "    ldmfdeq sp!, {r0, pc}           \n"  //         THEN RETURN orig_dest
        "    ldrb r3, [r1, #-1]!             \n"  //     R3 = *--src (byte)
        "    strb r3, [r0, #-1]!             \n"  //     *--dest = R3 (byte)
        "    subs r2, r2, #1                 \n"  //     len--
        "    b 1b                            \n"  // WHILE true
        // End of size optimized backward implementation

#else
        // Performance optimized backward implementation:
        // Skip to 3 byte tail if we have to copy less than 4 bytes
        "    subs r2, r2, #4                 \n"  // len -= 4                                           // len offset: -4
        "    blt 6f                          \n"  // IF len < 0: THEN GOTO [remainder3]
        // If the destination is misaligned, align it
        // This will return to label 3 if the source is aligned after the destination was aligned
        "    ands r12, r0, #3                \n"  // misalignment = dest & 3
        "    bne 7f                          \n"  // IF misalignment != 0: THEN GOTO [align]
        // The destination was already aligned, but the source isn't
        // We have no option but to fall back to byte by byte copying
        "    tst r1, #3                      \n"  // TEST src & 3
        "    bne 8f                          \n"  // IF src & 3: THEN GOTO [bytewise]
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "3:                                  \n"  // [aligned]: we are at least 4 byte aligned
        // Skip to 11 byte tail if we have to copy less than 12 bytes
        "    subs r2, r2, #8                 \n"  // len -= 8                                           // len offset: -12
        "    blt 5f                          \n"  // IF len < 0: THEN GOTO [remainder11]
        // Skip to 31 byte tail if we have to copy less than 32 bytes
        "    subs r2, r2, #0x14              \n"  // len -= 20                                          // len offset: -32
        "    blt 4f                          \n"  // IF len < 0: THEN GOTO [remainder31]
        // Save R4 so that we have an additional copying scratchpad register
        "    str r4, [sp, #-4]!              \n"  // SAVE R4                                            // STACK: R4 orig_dest return_addr
        "1:                                  \n"  // DO:
        // Copy 32 bytes at a time
        "    ldmdb r1!, {r3, r4, r12, lr}    \n"  //     {R3,R4,R12,LR} = *--src (qword)
        "    stmdb r0!, {r3, r4, r12, lr}    \n"  //     *--dest = {R3,R4,R12,LR} (qword)
        "    ldmdb r1!, {r3, r4, r12, lr}    \n"  //     {R3,R4,R12,LR} = *--src (qword)
        "    stmdb r0!, {r3, r4, r12, lr}    \n"  //     *--dest = {R3,R4,R12,LR} (qword)
        "    subs r2, r2, #0x20              \n"  //     len -= 32
        "    bge 1b                          \n"  // WHILE len >= 0
        // Less than 32 bytes remaining, copy 16 if enough are remaining
        "    cmn r2, #0x10                   \n"  // TEST len + 16
        "    ittt ge                         \n"  // IF len >= -16:
        "    ldmdbge r1!, {r3, r4, r12, lr}  \n"  //     THEN {R3,R4,R12,LR} = *--src (qword)
        "    stmdbge r0!, {r3, r4, r12, lr}  \n"  //     THEN *--dest = {R3,R4,R12,LR} (qword)
        "    subge r2, r2, #0x10             \n"  //     THEN len -= 16
        // No need for R4 anymore, restore it so that later code doesn't have to take care of it
        "    ldr r4, [sp], #4                \n"  // RESTORE R4                                         // STACK: orig_dest return_addr
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "4:                                  \n"  // [remainder31]: we have less than 32 bytes remaining
        // Correct length offset from 32 byte copying mode
        "    adds r2, r2, #0x14              \n"  // len += 20                                          // len offset: -12
        "1:                                  \n"  // DO:
        // Copy 12 bytes at a time, while enough are remaining
        "    itttt ge                        \n"  //     IF len >= 0:
        "    ldmdbge r1!, {r3, r12, lr}      \n"  //         THEN {R3,R12,LR} = *--src (12 bytes)
        "    stmdbge r0!, {r3, r12, lr}      \n"  //         THEN *--dest = {R3,R12,LR} (12 bytes)
        "    subsge r2, r2, #0x0c            \n"  //         THEN len -= 12
        "    bge 1b                          \n"  // WHILE len >= 12
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "5:                                  \n"  // [remainder11]: we have less than 12 bytes remaining
        // Correct length offset from 12 byte copying mode
        "    adds r2, r2, #8                 \n"  // len += 8                                           // len offset: -4
        // Skip to 3 byte tail if less than 4 bytes are remaining
        "    blt 6f                          \n"  // IF len < 0: THEN GOTO [remainder3]
        // We will copy at least 4 bytes, adjust length
        "    subs r2, r2, #4                 \n"  // len -= 4
        // If less than 8 bytes are remaining, copy 4 bytes
        "    itt lt                          \n"  // IF len < 0:
        "    ldrlt r3, [r1, #-4]!            \n"  //     THEN R3 = *--src (word)
        "    strlt r3, [r0, #-4]!            \n"  //     THEN *--dest = R3 (word)
        // If at least 8 bytes are remaining, copy 8 bytes
        "    ittt ge                         \n"  // IF len >= 0:
        "    ldmdbge r1!, {r3, r12}          \n"  //     THEN {R3,R12} = *--src (dword)
        "    stmdbge r0!, {r3, r12}          \n"  //     THEN *--dest = {R3,R12} (dword)
        // We have subtracted 4 bytes above but copied 8, adjust length
        "    subge r2, r2, #4                \n"  //     THEN len -= 4
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "6:                                  \n"  // [remainder3]: we have less than 4 bytes remaining, copy them individually
        // Correct length offset from 4 byte copying mode
        "    adds r2, r2, #4                 \n"  // len += 4                                           // len offset: 0
        // If we're finished, return
        "    it eq                           \n"  // IF len == 0:
        "    ldmfdeq sp!, {r0, pc}           \n"  //     THEN RETURN orig_dest
        "    cmp r2, #2                      \n"  // TEST len - 2
        // We always have to copy at least one byte
        "    ldrb r3, [r1, #-1]!             \n"  // R3 = *--src (byte)
        "    strb r3, [r0, #-1]!             \n"  // *--dest = R3 (byte)
        // If we have to copy at least two, copy another one
        "    itt ge                          \n"  // IF len >= 2:
        "    ldrbge r3, [r1, #-1]!           \n"  //     THEN R3 = *--src (byte)
        "    strbge r3, [r0, #-1]!           \n"  //     THEN *--dest = R3 (byte)
        // If we have to copy more than two (which is always 3), copy another one
        "    itt gt                          \n"  // IF len > 2:
        "    ldrbgt r3, [r1, #-1]!           \n"  //     THEN R3 = *--src (byte)
        "    strbgt r3, [r0, #-1]!           \n"  //     THEN *--dest = R3 (byte)
        // Everything copied, return
        "    ldmfd sp!, {r0, pc}             \n"  // RETURN orig_dest
        "                                    \n"  //
        "7:                                  \n"  // [align]: the destination is misaligned, align it
        // No need to negate the misalignment here, we are going backwards so we have to adjust by the misaligned amount
        "    cmp r12, #2                     \n"  // TEST misalignment - 2
        // We always have to copy at least one byte
        "    ldrb r3, [r1, #-1]!             \n"  // R3 = *--src (byte)
        "    strb r3, [r0, #-1]!             \n"  // *--dest = R3 (byte)
        // If we have to copy at least two, copy another one
        "    itt ge                          \n"  // IF misalignment >= 2:
        "    ldrbge r3, [r1, #-1]!           \n"  //     THEN R3 = *--src (byte)
        "    strbge r3, [r0, #-1]!           \n"  //     THEN *--dest = R3 (byte)
        // If we have to copy more than two (which is always 3), copy another one
        "    itt gt                          \n"  // IF misalignment > 2:
        "    ldrbgt r3, [r1, #-1]!           \n"  //     THEN R3 = *--src (byte)
        "    strbgt r3, [r0, #-1]!           \n"  //     THEN *--dest = R3 (byte)
        // The destination is aligned now, check if there are at least 4 bytes remaining
        "    subs r2, r2, r12                \n"  // len -= misalignment
        // Skip to 3 byte tail if not
        "    blt 6b                          \n"  // IF len < 0: THEN GOTO [remainder3]
        // If the source is now misaligned, we have to copy byte by byte
        "    tst r1, #3                      \n"  // TEST src & 3
        // If not, resume fast copying method above
        "    beq 3b                          \n"  // IF !(src & 3): GOTO [aligned]
        "                                    \n"  // (fallthrough)
        "                                    \n"  //
        "8:                                  \n"  // [bytewise]: the destination is aligned, but the source isn't, copy byte by byte
        "                                    \n"  // DO:
        // Stupidly copy a byte at a time
        "    ldrb r3, [r1, #-1]!             \n"  //     R3 = *--src (byte)
        "    strb r3, [r0, #-1]!             \n"  //     *--dest = R3 (byte)
        "    subs r2, r2, #1                 \n"  //     len--
        "    bge 8b                          \n"  // WHILE len >= 0
        // Less than 4 bytes remaining, use 3 byte tail copying code above
        "    b 6b                            \n"  // GOTO [remainder3]
        // End of performance optimized backward implementation
#endif
    );
}
547
 
548
// memcpy is an alias of memmove: memmove already copies non-overlapping
// regions at full speed, so no separate implementation is needed.
__attribute__((alias("memmove"))) void* memcpy(void* dst, const void* src, size_t len);
549
 
550
#ifdef CPU_ARM_CORTEX_M
551
// Byte-swap a 32-bit word (endianness conversion) using the REV instruction.
__attribute__((const)) uint32_t swap32(uint32_t data)
{
    __asm__("rev %[data], %[data]" : [data] "+r" (data));
    return data;
}
556
 
557
// Byte-swap each 16-bit halfword of a 32-bit word using the REV16
// instruction (both halfwords are swapped independently).
__attribute__((const)) uint32_t swap16(uint32_t data)
{
    __asm__("rev16 %[data], %[data]" : [data] "+r" (data));
    return data;
}
562
 
563
// Reverse the bit order of a 32-bit word using the RBIT instruction.
__attribute__((const)) uint32_t reverse32(uint32_t data)
{
    __asm__("rbit %[data], %[data]" : [data] "+r" (data));
    return data;
}
568
#endif