/*
 * Source path: root/libc/arch-arm/bionic/memcpy.S
 * Source blob: e92ff5e6be57e7863ade8e29ac6b7d6c20ef0d20
 */
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

#if defined(__ARM_NEON__)
#if defined(SCORPION_NEON_OPTIMIZATION)
	/*
	 * NEON memcpy selected by SCORPION_NEON_OPTIMIZATION.
	 *
	 * The prefetch parameters below can be overridden in:
	 *   device/<vendor>/<board>/BoardConfig.mk
	 * by setting the following:
	 *   TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
	 *   TARGET_USE_SCORPION_PLD_SET := true
	 *   TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
	 *   TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
	 */
#ifndef PLDOFFS
#define PLDOFFS	(6)	/* prefetch distance, counted in PLDSIZE-byte blocks */
#endif
#ifndef PLDSIZE
#define PLDSIZE	(128)	/* L2 cache line size */
#endif
        .code 32
        .align 5
        .globl memcpy
        /* Mark the symbol as a function, as the other memcpy variants in
         * this file do, so the ELF symbol carries the function type. */
        .type memcpy, %function
        .func
/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:     r0 = dst, r1 = src, r2 = n (unsigned byte count)
 * Out:    r0 = dst (pushed on entry, popped at .Lneon_exit)
 * Uses:   r1, r2, r3, r12, q0-q3, q8-q11, condition flags
 * Stack:  one word (saved r0)
 *
 * The count is dispatched by size: blocks of 128 bytes (with and without
 * software prefetch), then blocks of 32, one optional block of 16, and
 * finally 8/4/2/1 bytes selected by the low bits of the count.
 *
 * All comparisons on the count are unsigned (blo/bls): n is a size_t, and
 * a signed compare would treat n >= 0x80000000 as "less than 4".
 */
memcpy:
	push            {r0}
	cmp             r2, #4
	blo             .Lneon_lt4
	cmp             r2, #16
	blo             .Lneon_lt16
	cmp             r2, #32
	blo             .Lneon_16
	cmp             r2, #128
	blo             .Lneon_copy_32_a
	/* Copy blocks of 128 bytes at a time. */
	/* Code below is optimized for PLDSIZE=128 only */
	mov             r12, r2, lsr #7		/* r12 = number of 128-byte blocks (>= 1) */
	cmp             r12, #PLDOFFS
	bls             .Lneon_copy_128_loop_nopld	/* too short to prefetch PLDOFFS blocks ahead */
	sub             r12, #PLDOFFS			/* the last PLDOFFS blocks run without pld */
	pld             [r1, #(PLDOFFS-1)*PLDSIZE]
.Lneon_copy_128_loop_outer:
	pld             [r1, #(PLDOFFS*PLDSIZE)]
	vld1.32         {q0, q1}, [r1]!
	vld1.32         {q2, q3}, [r1]!
	vld1.32         {q8, q9}, [r1]!
	vld1.32         {q10, q11}, [r1]!
	subs            r12, r12, #1
	vst1.32         {q0, q1}, [r0]!
	vst1.32         {q2, q3}, [r0]!
	vst1.32         {q8, q9}, [r0]!
	vst1.32         {q10, q11}, [r0]!
	bne             .Lneon_copy_128_loop_outer
	mov             r12, #PLDOFFS			/* remaining blocks, copied without pld */
.Lneon_copy_128_loop_nopld:
	vld1.32         {q0, q1}, [r1]!
	vld1.32         {q2, q3}, [r1]!
	vld1.32         {q8, q9}, [r1]!
	vld1.32         {q10, q11}, [r1]!
	subs            r12, r12, #1
	vst1.32         {q0, q1}, [r0]!
	vst1.32         {q2, q3}, [r0]!
	vst1.32         {q8, q9}, [r0]!
	vst1.32         {q10, q11}, [r0]!
	bne             .Lneon_copy_128_loop_nopld
	ands            r2, r2, #0x7f			/* r2 = bytes left after the 128-byte blocks */
	beq             .Lneon_exit
	cmp             r2, #32
	blo             .Lneon_16
	nop						/* padding; presumably for loop alignment - TODO confirm */
	/* Copy blocks of 32 bytes at a time; r2 >= 32 here, so r12 >= 1. */
.Lneon_copy_32_a:
	mov             r12, r2, lsr #5
.Lneon_copy_32_loop_a:
	vld1.32         {q0,q1}, [r1]!
	subs            r12, r12, #1
	vst1.32         {q0,q1}, [r0]!
	bne             .Lneon_copy_32_loop_a
	ands            r2, r2, #0x1f			/* r2 = bytes left after the 32-byte blocks */
	beq             .Lneon_exit
	/* r2 < 32 here: copy one 16-byte block if there is room for it. */
.Lneon_16:
	subs            r2, r2, #16
	blo             .Lneon_lt16			/* fewer than 16 left; low 4 bits of r2 are unchanged */
	vld1.32         {q8}, [r1]!
	vst1.32         {q8}, [r0]!
	beq             .Lneon_exit			/* Z still comes from the subs above */
	/* Fewer than 16 bytes left; only bits 3..0 of r2 are meaningful. */
.Lneon_lt16:
	movs            r12, r2, lsl #29		/* C = bit 3 (8 bytes), N = bit 2 (4 bytes) */
	bcc             .Lneon_skip8
	ldr             r3, [r1], #4
	ldr             r12, [r1], #4
	str             r3, [r0], #4
	str             r12, [r0], #4
.Lneon_skip8:
	bpl             .Lneon_lt4			/* N is untouched by the loads/stores above */
	ldr             r3, [r1], #4
	str             r3, [r0], #4
.Lneon_lt4:
	movs            r2, r2, lsl #31			/* C = bit 1 (2 bytes), N = bit 0 (1 byte) */
	bcc             .Lneon_lt2
	ldrh            r3, [r1], #2
	strh            r3, [r0], #2
.Lneon_lt2:
	bpl             .Lneon_exit
	ldrb            r12, [r1]
	strb            r12, [r0]
.Lneon_exit:
	pop             {r0}				/* return the original destination pointer */
	bx              lr
	.endfunc
	.end
#else /* !SCORPION_NEON_OPTIMIZATION */
        .text
        .fpu    neon

        .global memcpy
        .type memcpy, %function
        .align 4

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     32
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)

/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:     r0 = dst, r1 = src, r2 = n
 * Out:    r0 = dst (saved on entry, reloaded before returning)
 * Uses:   r1, r2, r3, ip, lr (saved/restored), d0-d7, condition flags
 * Stack:  two words (r0, lr)
 *
 * Outline: copy 0-15 head bytes so that the destination is 16-byte
 * aligned, then copy 64 bytes per iteration, then 32, then an optional
 * 16, and finally 0-15 tail bytes selected by the low bits of the count.
 *
 * The "movs rX, rY, lsl #k" instructions only update N, Z and C; V keeps
 * whatever an earlier cmp/subs left there. The bit tests that follow
 * therefore use bpl/bcc, never a condition that reads V.
 */
memcpy:
        .fnstart
        .save       {r0, lr}
        stmfd       sp!, {r0, lr}

        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16-bytes to copy (needed for alignment below) */
        cmp         r2, #16
        blo         5f

        /* align destination to half cache-line for the write-buffer */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF            /* r3 = bytes needed to reach 16-byte alignment */
        beq         0f

        /* copy up to 15-bytes (count in r3) */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31         /* N = bit 0 (1 byte), C = bit 1 (2 bytes) */
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29         /* N = bit 2 (4 bytes), C = bit 3 (8 bytes) */
        bpl         1f                      /* bit 2 clear: no 4-byte step */
        // copies 4 bytes, destination 32-bits aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f                      /* bit 3 clear: no 8-byte step */
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld         [r1, #(CACHE_LINE_SIZE*2)]
        pld         [r1, #(CACHE_LINE_SIZE*3)]
        pld         [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time; r2 is biased by -64 */
        vld1.8      {d0  - d3},   [r1]!
        vld1.8      {d4  - d7},   [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        subs        r2, r2, #64
        vst1.8      {d0  - d3},   [r0, :128]!
        vst1.8      {d4  - d7},   [r0, :128]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3},  [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3},  [r0, :128]!
        bhs         3b

4:      /* less than 32 left */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

5:      /* copy up to 15-bytes (count in the low 4 bits of r2) */
        movs        ip, r2, lsl #29         /* C = bit 3 (8 bytes), N = bit 2 (4 bytes) */
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bpl         2f                      /* bit 2 clear: no 4-byte step */
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs        ip, r2, lsl #31         /* N = bit 0 (1 byte), C = bit 1 (2 bytes) */
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        /* reload the original destination (return value) and lr */
        ldmfd       sp!, {r0, lr}
        bx          lr
        .fnend

#endif  /* !SCORPION_NEON_OPTIMIZATION */
#else   /* !__ARM_NEON__ */


	.text

    .global memcpy
    .type memcpy, %function
    .align 4

		/*
		 * Optimized memcpy() for ARM, using only the integer registers.
		 *
		 * void *memcpy(void *dst, const void *src, size_t n)
		 *
		 * In:    r0 = dst, r1 = src, r2 = n
		 * Out:   r0 = dst. memcpy() always returns the destination
		 *        pointer, so r0 is saved on entry and reloaded on exit.
		 * Uses:  r1, r2, r3, r12 and the flags are modified. r4-r11 and
		 *        lr are used as scratch but saved and restored.
		 * Stack: 12 bytes for {r0, r4, lr} plus a 28-byte area that
		 *        holds r5-r11 while they are in use (40 bytes in total).
		 *
		 * Outline: the source is first aligned to 32 bits. If the
		 * destination then has the same word alignment ("congruent"),
		 * whole registers are copied with ldm/stm; otherwise each
		 * destination word is assembled from two source words with
		 * shifts ("non_congruent").
		 *
		 * Throughout, "movs rX, rY, lsl #k" is used to move two bits of
		 * a byte count into the C and N flags so that the following
		 * conditional loads/stores copy the matching number of bytes.
		 */

memcpy:
		/* The stack must always be 64-bits aligned to be compliant with the
		 * ARM ABI. Since we have to save R0, we might as well save R4
		 * which we can use for better pipelining of the reads below
		 */
        .fnstart
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad        #28
        sub         sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD         (r0, #0)
        PLD         (r1, #0)
        PLD         (r1, #32)

		/* it simplifies things to take care of len<4 early */
		cmp			r2, #4
		blo			copy_last_3_and_return

		/* compute the offset to align the source
		 * offset = (4-(src&3))&3 = -src & 3
		 */
		rsb			r3, r1, #0
		ands		r3, r3, #3
		beq			src_aligned

		/* align source to 32 bits. We need to insert 2 instructions between
		 * a ldr[b|h] and str[b|h] because byte and half-word instructions
		 * stall 2 cycles.
		 * After the movs: N = bit 0 of r3 (copy 1 byte),
		 *                 C = bit 1 of r3 (copy 2 bytes).
		 */
		movs		r12, r3, lsl #31
		sub			r2, r2, r3		/* we know that r3 <= r2 because r2 >= 4 */
		ldrmib		r3, [r1], #1
		ldrcsb		r4, [r1], #1
		ldrcsb		r12,[r1], #1
        strmib		r3, [r0], #1
		strcsb		r4, [r0], #1
		strcsb		r12,[r0], #1

src_aligned:

		/* see if src and dst are aligned together (congruent) */
		eor			r12, r0, r1
		tst			r12, #3
		bne			non_congruent

        /* Spill r5-r11 into the 28-byte area reserved in the prologue.
         * stmea without '!' stores upwards from sp and leaves sp unchanged.
         */
        stmea		sp, {r5-r11}

		/* align the destination to a cache-line: r3 = number of bytes
		 * (a multiple of 4) up to the next 32-byte boundary, clamped to
		 * the whole words available in r2 when fewer bytes remain.
		 */
		rsb         r3, r0, #0
		ands		r3, r3, #0x1C
		beq         congruent_aligned32
		cmp         r3, r2
		andhi		r3, r2, #0x1C

		/* conditionally copies 0 to 7 words (length in bytes in r3);
		 * after the movs: C = bit 4 (16 bytes), N = bit 3 (8 bytes)
		 */
		movs		r12, r3, lsl #28
		ldmcsia		r1!, {r4, r5, r6, r7}	/* 16 bytes */
		ldmmiia		r1!, {r8, r9}			/*  8 bytes */
		stmcsia		r0!, {r4, r5, r6, r7}
		stmmiia		r0!, {r8, r9}
		tst         r3, #0x4
		ldrne		r10,[r1], #4			/*  4 bytes */
		strne		r10,[r0], #4
		sub         r2, r2, r3

congruent_aligned32:
		/*
		 * here the destination has been advanced toward a 32-byte
		 * boundary (it is fully aligned when enough bytes remained) and
		 * source and destination still share the same word alignment.
		 */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory
         *
         * While all this is going, we then load a full cache line into
         * 8 registers, this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

        // Main congruent loop: 32 bytes per iteration. r2 is biased by -32,
        // so "bhs" keeps looping while at least 32 more bytes remain.
1:      ldmia       r1!, { r4-r11 }
        PLD         (r12, #64)
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded the only possibility to have SIGSEGV here
        // is because the caller overstates the length.
        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
        stmia       r0!, { r4-r11 }
		bhs         1b

        add         r2, r2, #32




less_than_32_left:
		/*
		 * less than 32 bytes left at this point (length in the low
		 * 5 bits of r2; the upper bits may still carry the -32 bias
		 * when the main loop above was skipped)
		 */

		/* skip all this if there is nothing to do, which should
		 * be a common case (if not executed the code below takes
		 * about 16 cycles)
		 */
		tst			r2, #0x1F
		beq			1f

		/* conditionally copies 0 to 31 bytes;
		 * first movs:  C = bit 4 (16 bytes), N = bit 3 (8 bytes)
		 * second movs: C = bit 2 (4 bytes),  N = bit 1 (2 bytes)
		 */
		movs		r12, r2, lsl #28
		ldmcsia		r1!, {r4, r5, r6, r7}	/* 16 bytes */
		ldmmiia		r1!, {r8, r9}			/*  8 bytes */
		stmcsia		r0!, {r4, r5, r6, r7}
		stmmiia		r0!, {r8, r9}
		movs		r12, r2, lsl #30
		ldrcs		r3, [r1], #4			/*  4 bytes */
		ldrmih		r4, [r1], #2			/*  2 bytes */
		strcs		r3, [r0], #4
		strmih		r4, [r0], #2
		tst         r2, #0x1
		ldrneb		r3, [r1]				/*  last byte  */
		strneb		r3, [r0]

		/* we're done! pop the spilled r5-r11, then the registers saved
		 * in the prologue (r0 = original destination), and return
		 */
1:		ldmfd		sp!, {r5-r11}
		ldmfd		sp!, {r0, r4, lr}
		bx			lr

		/********************************************************************/

non_congruent:
		/*
		 * here source is aligned to 4 bytes
		 * but destination is not.
		 *
		 * in the code below r2 counts the bytes that still have to be
		 * read from the source (stores lag behind the loads, because a
		 * partial word is always held in the shift queue, r3)
		 */
		cmp			r2, #4
		blo			copy_last_3_and_return

        /* Spill r5-r11 into the 28-byte area reserved in the prologue.
         * stmea without '!' stores upwards from sp and leaves sp unchanged.
         */
        stmea		sp, {r5-r11}

		/* compute shifts needed to align src to dest */
		rsb			r5, r0, #0
		and			r5, r5, #3			/* r5 = # bytes in partial words */
		mov			r12, r5, lsl #3		/* r12 = right */
		rsb			lr, r12, #32		/* lr = left  */

		/* read the first word */
		ldr			r3, [r1], #4
		sub			r2, r2, #4

		/* write a partial word (1 to 3 bytes), such that destination
		 * becomes aligned to 32 bits (r5 = number of bytes to store for
		 * alignment). After the movs: N = bit 0 (1 byte), C = bit 1
		 * (2 bytes). r3 is shifted right as bytes are consumed, so the
		 * bytes not yet written stay in its low end.
		 */
		movs		r5, r5, lsl #31
		strmib		r3, [r0], #1
		movmi		r3, r3, lsr #8
		strcsb		r3, [r0], #1
		movcs		r3, r3, lsr #8
		strcsb		r3, [r0], #1
		movcs		r3, r3, lsr #8

		cmp			r2, #4
		blo			partial_word_tail

		/* Align destination to 32 bytes (cache line boundary), one word
		 * at a time: merge the queued bytes in r3 with the next source
		 * word, store the result, keep the leftover bytes in r3.
		 */
1:		tst			r0, #0x1c
		beq			2f
		ldr			r5, [r1], #4
		sub         r2, r2, #4
		orr			r4, r3, r5,		lsl lr
		mov			r3, r5,			lsr r12
		str			r4, [r0], #4
        cmp         r2, #4
		bhs			1b
		blo			partial_word_tail

		/* copy 32 bytes at a time */
2:		subs		r2, r2, #32
		blo			less_than_thirtytwo

		/* Use immediate mode for the shifts, because there is an extra cycle
		 * for register shifts, which could account for up to 50% of
		 * performance hit. One loop exists per possible right-shift
		 * amount (r12 = 8, 16 or 24).
		 */

        cmp			r12, #24
		beq			loop24
		cmp			r12, #8
		beq			loop8

		/* right shift 16 / left shift 16. r12 is reused to hold the
		 * first source word of the next iteration, loaded early.
		 */
loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4
		orr			r3, r3, r4,		lsl #16
		mov			r4, r4,			lsr #16
		orr			r4, r4, r5,		lsl #16
		mov			r5, r5,			lsr #16
		orr			r5, r5, r6,		lsl #16
		mov			r6, r6,			lsr #16
		orr			r6, r6, r7,		lsl #16
		mov			r7, r7,			lsr #16
		orr			r7, r7, r8,		lsl #16
		mov			r8, r8,			lsr #16
		orr			r8, r8, r9,		lsl #16
		mov			r9, r9,			lsr #16
		orr			r9, r9, r10,	lsl #16
		mov			r10, r10,		lsr #16
		orr			r10, r10, r11,	lsl #16
		stmia		r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
		mov			r3, r11,		lsr #16
		bhs			1b
		b			less_than_thirtytwo

		/* right shift 8 / left shift 24 */
loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
		subs		r2, r2, #32
        ldrhs       r12, [r1], #4
		orr			r3, r3, r4,		lsl #24
		mov			r4, r4,			lsr #8
		orr			r4, r4, r5,		lsl #24
		mov			r5, r5,			lsr #8
		orr			r5, r5, r6,		lsl #24
		mov			r6, r6,			lsr #8
		orr			r6, r6, r7,		lsl #24
		mov			r7, r7,			lsr #8
		orr			r7, r7, r8,		lsl #24
		mov			r8, r8,			lsr #8
		orr			r8, r8, r9,		lsl #24
		mov			r9, r9,			lsr #8
		orr			r9, r9, r10,	lsl #24
		mov			r10, r10,		lsr #8
		orr			r10, r10, r11,	lsl #24
		stmia		r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
		mov			r3, r11,		lsr #8
		bhs			1b
		b			less_than_thirtytwo

		/* right shift 24 / left shift 8; falls through to the tail */
loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
		ldmia		r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
		subs		r2, r2, #32
        ldrhs       r12, [r1], #4
		orr			r3, r3, r4,		lsl #8
		mov			r4, r4,			lsr #24
		orr			r4, r4, r5,		lsl #8
		mov			r5, r5,			lsr #24
		orr			r5, r5, r6,		lsl #8
		mov			r6, r6,			lsr #24
		orr			r6, r6, r7,		lsl #8
		mov			r7, r7,			lsr #24
		orr			r7, r7, r8,		lsl #8
		mov			r8, r8,			lsr #24
		orr			r8, r8, r9,		lsl #8
		mov			r9, r9,			lsr #24
		orr			r9, r9, r10,	lsl #8
		mov			r10, r10,		lsr #24
		orr			r10, r10, r11,	lsl #8
		stmia		r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
		mov			r3, r11,		lsr #24
		bhs			1b


less_than_thirtytwo:
		/* copy the last 0 to 31 bytes of the source */
		rsb			r12, lr, #32		/* we corrupted r12, recompute it  */
		add			r2, r2, #32
		cmp			r2, #4
		blo			partial_word_tail

		/* word-at-a-time merge loop, same scheme as the alignment loop */
1:		ldr			r5, [r1], #4
		sub         r2, r2, #4
		orr			r4, r3, r5,		lsl lr
		mov			r3,	r5,			lsr r12
		str			r4, [r0], #4
        cmp         r2, #4
		bhs			1b

partial_word_tail:
		/* we have a partial word in the input buffer: r3 still holds
		 * lr/8 bytes (1 to 3). lr is 8, 16 or 24, so after the movs
		 * N = bit 3 of lr (1 byte) and C = bit 4 of lr (2 bytes).
		 */
		movs		r5, lr, lsl #(31-3)
		strmib		r3, [r0], #1
		movmi		r3, r3, lsr #8
		strcsb		r3, [r0], #1
		movcs		r3, r3, lsr #8
		strcsb		r3, [r0], #1

		/* Refill spilled registers from the stack. Don't update sp. */
		ldmfd		sp, {r5-r11}

copy_last_3_and_return:
		/* after the movs: N = bit 0 of r2 (1 byte), C = bit 1 (2 bytes);
		 * r2 itself is then reused as a data register
		 */
		movs		r2, r2, lsl #31	/* copy remaining 0, 1, 2 or 3 bytes */
		ldrmib		r2, [r1], #1
		ldrcsb		r3, [r1], #1
		ldrcsb		r12,[r1]
		strmib		r2, [r0], #1
		strcsb		r3, [r0], #1
		strcsb		r12,[r0]

        /* we're done! drop the 28-byte spill area (r5-r11 were either
         * never modified or already reloaded above), then restore the
         * registers saved in the prologue and return
         */
        add         sp,  sp, #28
		ldmfd		sp!, {r0, r4, lr}
		bx			lr
        .fnend


#endif    /* !__ARM_NEON__ */