@ inflate_fast_copy_neon.s
@
@ Steven Craft, 02/28/2014 09:26 AM
@
@ Download (20.1 KB)

 
1
@ Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
2
@
3
@ Redistribution and use in source and binary forms, with or without
4
@ modification, are permitted provided that the following conditions are
5
@ met:
6
@     * Redistributions of source code must retain the above copyright
7
@       notice, this list of conditions and the following disclaimer.
8
@     * Redistributions in binary form must reproduce the above
9
@       copyright notice, this list of conditions and the following
10
@       disclaimer in the documentation and/or other materials provided
11
@       with the distribution.
12
@     * Neither the name of Code Aurora Forum, Inc. nor the names of its
13
@       contributors may be used to endorse or promote products derived
14
@       from this software without specific prior written permission.
15
@
16
@ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
17
@ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
18
@ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
19
@ ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
20
@ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21
@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22
@ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
23
@ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
24
@ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
25
@ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
26
@ IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27

    
28
@============================================================================
29
@  Code Section
30
        .code 32                                         @ Code is ARM ISA
31
@============================================================================
32

    
33
        .globl     _inflate_fast_copy_neon
34

    
35

    
36
@============================================================================
@  _inflate_fast_copy_neon
@
@  Byte copy using pre-increment addressing: each step reads *(++from) and
@  writes it to *(++out).
@
@       INPUTS:    r0       len:     number of bytes to transfer
@                  r1       **out:   pointer to pointer to ``out'' buffer
@                  r2       *from:   pointer to ``from'' buffer
@       OUTPUTS:   [r1]     *out:    written back after the copy; on the
@                                    len < 16 path below it is advanced by
@                                    len, and left untouched when len == 0
@       SCRATCH:   r3 (working copy of *out), r12 (byte in flight)
@                  r4-r11 are saved here and restored at
@                  inflate_fast_copy_exit
@============================================================================
.balign 32
@.type inflate_fast_copy_neon, %function
_inflate_fast_copy_neon:
       push       {r4-r11}             @ matching pop is at inflate_fast_copy_exit

       cmp        r0,#16
       bge        inflate_fast_copy_vectorized   @ len >= 16: NEON routines

       @; ------------------------------------------------------------------
       @; len < 16: plain byte-at-a-time copy
       @; ------------------------------------------------------------------
inflate_fast_copy_default:

       cmp        r0,#0
       beq        inflate_fast_copy_exit         @ nothing to copy, *out unchanged

       ldr        r3,[r1]              @ r3 = *out

inflate_fast_copy_default_loop:

       ldrb       r12,[r2,#1]!         @ r12 = *(++from)
       strb       r12,[r3,#1]!         @ *(++out) = r12 (flags not affected)
       subs       r0,r0,#1             @ len--, Z set when the last byte is done
       bne        inflate_fast_copy_default_loop

       str        r3,[r1]              @ *out = advanced out pointer
       b          inflate_fast_copy_exit
70

    
71
       @; ------------------------------------------------------------------
       @; vectorized copy routines, entered only when len >= 16 bytes
       @;
       @; Picks a copy routine from the distance between the two pointers.
       @; The result must match the byte loop  *(++out) = *(++from):
       @;   * out <= from (unsigned): a forward copy never reads a byte it
       @;     has already overwritten, so the generic 16-byte block routine
       @;     is correct (this also covers out == from, where *out must
       @;     still advance by len).
       @;   * out >  from: the source runs into bytes produced by this very
       @;     copy, i.e. the output is a pattern with period gap = out-from.
       @;     Gaps below 16 need the pattern-aware routines.
       @; Pointers are addresses, so every comparison here is unsigned.
       @;
       @; INPUTS:
       @;  r0 = len (>= 16)
       @;  r1 = pointer to pointer to out
       @;  r2 = pointer to from
       @; OUTPUTS (to the selected routine):
       @;  r3 = pointer to out
       @;  r4 = gap = out - from (used by the gap 3, 5-7 and 9-15 routines)
       @; ------------------------------------------------------------------
inflate_fast_copy_vectorized:

      ldr        r3,[r1,#0]            @ r3 = pointer to out
                                       @ DON'T TOUCH r1 UNTIL FINAL
                                       @  UPDATE OF r1 WITH ADDRESS OF r3
      subs       r4,r3,r2              @ r4 = out - from, flags as cmp r3,r2
      bls        inflate_fast_copy_gap16b_proc   @ out <= from: plain forward copy

      cmp        r4,#1
      beq        inflate_fast_copy_gap1b_proc

      cmp        r4,#2
      beq        inflate_fast_copy_gap2b_proc

      cmp        r4,#3
      beq        inflate_fast_copy_gap3b_proc

      cmp        r4,#4
      beq        inflate_fast_copy_gap4b_proc

      cmp        r4,#8
      blo        inflate_fast_copy_gap5to7b_proc
      beq        inflate_fast_copy_gap8b_proc

      cmp        r4,#16
      blo        inflate_fast_copy_gap9to15b_proc
      b          inflate_fast_copy_gap16b_proc   @ gap >= 16: no overlap within a block
104

    
105

    
106
      @; ------------------------------------------------------------------
      @; vectorized copy routine when gap between ``from'' and ``out''
      @;  buffers is 1 byte: the byte at from+1 is replicated len times
      @; INPUTS:
      @;  r0 = len
      @;  r1 = pointer to pointer to out
      @;  r2 = pointer to from
      @;  r3 = pointer to out
      @; OUTPUTS:
      @;  [r1] = out + len
      @; SCRATCH:
      @;  r4 = 16-byte block counter, r12 = fill byte, q0 = fill vector
      @; ------------------------------------------------------------------
inflate_fast_copy_gap1b_proc:

      ldrb       r12,[r2,#1]!              @ r12 = *(++from), the fill byte
      add        r3,r3,#1                  @ out++ so post-indexed stores can be used
      vdup.8     q0,r12                    @ q0 = fill byte in all 16 lanes

      lsrs       r4,r0,#4                  @ r4 = len / 16, Z set if no full block
      and        r0,r0,#15                 @ r0 = len % 16 (leaves flags alone)
      beq        inflate_fast_copy_gap1b_proc_16bytes_loop_done

inflate_fast_copy_gap1b_proc_16bytes_loop:

      vst1.8     {q0},[r3]!                @ write one 16-byte block, out += 16
      subs       r4,r4,#1                  @ one block fewer to go
      bne        inflate_fast_copy_gap1b_proc_16bytes_loop

inflate_fast_copy_gap1b_proc_16bytes_loop_done:

      cmp        r0,#0                     @ any tail bytes left?
      beq        inflate_fast_copy_gap1b_proc_update_out

inflate_fast_copy_gap1b_proc_lastfewbytes_loop:

      strb       r12,[r3],#1               @ *out = fill byte, out++
      subs       r0,r0,#1                  @ len--
      bne        inflate_fast_copy_gap1b_proc_lastfewbytes_loop

inflate_fast_copy_gap1b_proc_update_out:

      sub        r3,r3,#1                  @ undo the initial out++
      str        r3,[r1,#0]                @ *out = last byte written
      b          inflate_fast_copy_exit
153

    
154
      @; ------------------------------------------------------------------
      @; vectorized copy routine when gap between ``from'' and ``out''
      @;  buffers is 2 bytes: the 2 bytes at from+1 are replicated
      @; INPUTS:
      @;  r0 = len
      @;  r1 = pointer to pointer to out
      @;  r2 = pointer to from
      @;  r3 = pointer to out
      @; OUTPUTS:
      @;  [r1] = out + len
      @; SCRATCH:
      @;  r4 = 16-byte block counter, r12 = tail byte, q0 = pattern vector
      @; ------------------------------------------------------------------
inflate_fast_copy_gap2b_proc:

      add        r3,r3,#1                @ out++
      add        r2,r2,#1                @ from++
      vld1.16    {d0[0]},[r2]            @ fetch the 2-byte pattern
      vdup.16    q0,d0[0]                @ q0 = pattern repeated 8 times

      lsrs       r4,r0,#4                @ r4 = len / 16, Z set if no full block
      and        r0,r0,#15               @ r0 = len % 16 (leaves flags alone)
      beq        inflate_fast_copy_gap2b_proc_16bytes_loop_done

inflate_fast_copy_gap2b_proc_16bytes_loop:

      vst1.8     {q0},[r3]!              @ write one 16-byte block, out += 16
      subs       r4,r4,#1                @ one block fewer to go
      bne        inflate_fast_copy_gap2b_proc_16bytes_loop

inflate_fast_copy_gap2b_proc_16bytes_loop_done:

      cmp        r0,#0                   @ any tail bytes left?
      beq        inflate_fast_copy_gap2b_proc_update_out

      @; r2 was not advanced by the block loop; 16 is a multiple of the
      @; pattern length, so reading from r2 onward continues the pattern.
inflate_fast_copy_gap2b_proc_lastfewbytes_loop:

      ldrb       r12,[r2],#1             @ r12 = *from, from++
      strb       r12,[r3],#1             @ *out = r12, out++
      subs       r0,r0,#1                @ len--
      bne        inflate_fast_copy_gap2b_proc_lastfewbytes_loop

inflate_fast_copy_gap2b_proc_update_out:

      sub        r3,r3,#1                @ undo the initial out++
      str        r3,[r1,#0]              @ *out = last byte written
      b          inflate_fast_copy_exit
205

    
206
      @; ------------------------------------------------------------------
      @; vectorized copy routine when gap between ``from'' and ``out''
      @;  buffers is 3 bytes
      @;
      @; One 32-bit lane is loaded from from+1: 3 pattern bytes followed by
      @; 1 byte that is not part of the pattern. Each store writes all 4
      @; bytes but advances out by only 3, so the next store (or the byte
      @; loop) overwrites the extra byte.
      @;
      @; The vector loop only runs while at least 4 bytes remain, so a store
      @; never writes beyond out+len. It always leaves 1..3 bytes for the
      @; byte loop, which overwrites the extra byte of the last store.
      @;
      @; INPUTS:
      @;  r0 = len
      @;  r1 = pointer to pointer to out
      @;  r2 = pointer to from
      @;  r3 = pointer to out
      @;  r4 = 3
      @; OUTPUTS:
      @;  [r1] = out + len
      @; ------------------------------------------------------------------
inflate_fast_copy_gap3b_proc:

      add        r2,r2,#1                @ from++
      add        r3,r3,#1                @ out++
                                         @
      vld1.32    {d0[0]},[r2]            @ load 4 bytes into d0[0]
                                         @  (3 pattern bytes + 1 extra)

inflate_fast_copy_gap3b_proc_3bytes_loop:

      cmp        r0,#4                   @ a store writes 4 bytes: stop once
      blo        inflate_fast_copy_gap3b_proc_3bytes_loop_done
                                         @  fewer than 4 remain

      vst1.32    {d0[0]},[r3],r4         @ store 4 bytes in out
                                         @ out+=3

      sub        r0,r0,#3                @ len-=3
      b          inflate_fast_copy_gap3b_proc_3bytes_loop

inflate_fast_copy_gap3b_proc_3bytes_loop_done:

      cmp        r0,#0                   @ only possible if entered with len == 0
      beq        inflate_fast_copy_gap3b_proc_update_out

      @; r2 still points at the start of the pattern and out has advanced
      @; by a multiple of 3, so reading from r2 onward continues the pattern.
inflate_fast_copy_gap3b_proc_lastfewbytes_loop:

      ldrb       r12,[r2],#1             @ r12 = *from, from++
      subs       r0,r0,#1                @ len--
      strb       r12,[r3],#1             @ *out = r12, out++

      bne        inflate_fast_copy_gap3b_proc_lastfewbytes_loop

inflate_fast_copy_gap3b_proc_update_out:

      sub        r3,r3,#1                @ out--
      str        r3,[r1,#0]              @ *out = last byte written
      b          inflate_fast_copy_exit
255

    
256
      @; ------------------------------------------------------------------
      @; vectorized copy routine when gap between ``from'' and ``out''
      @;  buffers is 4 bytes: the 4 bytes at from+1 are replicated
      @; INPUTS:
      @;  r0 = len
      @;  r1 = pointer to pointer to out
      @;  r2 = pointer to from
      @;  r3 = pointer to out
      @; OUTPUTS:
      @;  [r1] = out + len
      @; SCRATCH:
      @;  r4 = 16-byte block counter, r12 = tail byte, q0 = pattern vector
      @; ------------------------------------------------------------------
inflate_fast_copy_gap4b_proc:

      add        r3,r3,#1               @ out++
      add        r2,r2,#1               @ from++
      vld1.32    {d0[0]},[r2]           @ fetch the 4-byte pattern
      vdup.32    q0,d0[0]               @ q0 = pattern repeated 4 times

      lsrs       r4,r0,#4               @ r4 = len / 16, Z set if no full block
      and        r0,r0,#15              @ r0 = len % 16 (leaves flags alone)
      beq        inflate_fast_copy_gap4b_proc_16bytes_loop_done

inflate_fast_copy_gap4b_proc_16bytes_loop:

      vst1.32    {q0},[r3]!             @ write one 16-byte block, out += 16
      subs       r4,r4,#1               @ one block fewer to go
      bne        inflate_fast_copy_gap4b_proc_16bytes_loop

inflate_fast_copy_gap4b_proc_16bytes_loop_done:

      cmp        r0,#0                  @ any tail bytes left?
      beq        inflate_fast_copy_gap4b_proc_update_out

      @; r2 was not advanced by the block loop; 16 is a multiple of the
      @; pattern length, so reading from r2 onward continues the pattern.
inflate_fast_copy_gap4b_proc_lastfewbytes_loop:

      ldrb       r12,[r2],#1            @ r12 = *from, from++
      strb       r12,[r3],#1            @ *out = r12, out++
      subs       r0,r0,#1               @ len--
      bne        inflate_fast_copy_gap4b_proc_lastfewbytes_loop

inflate_fast_copy_gap4b_proc_update_out:

      sub        r3,r3,#1               @ undo the initial out++
      str        r3,[r1,#0]             @ *out = last byte written
      b          inflate_fast_copy_exit
307

    
308
      @; ------------------------------------------------------------------
      @; vectorized copy routine when gap between ``from'' and ``out''
      @;  buffers is {5-7} bytes
      @;
      @; 8 bytes are loaded from from+1: ``gap'' pattern bytes followed by
      @; 8-gap bytes that are not part of the pattern. Each store writes all
      @; 8 bytes but advances out by only ``gap'', so the next store (or the
      @; byte loop) overwrites the extra bytes.
      @;
      @; The vector loop only runs while at least 8 bytes remain, so a store
      @; never writes beyond out+len. It always leaves at least 8-gap bytes
      @; for the byte loop, which overwrites the extra bytes of the last
      @; store.
      @;
      @; INPUTS:
      @;  r0 = len
      @;  r1 = pointer to pointer to out
      @;  r2 = pointer to from
      @;  r3 = pointer to out
      @;  r4 = {5-7}
      @; OUTPUTS:
      @;  [r1] = out + len
      @; ------------------------------------------------------------------
inflate_fast_copy_gap5to7b_proc:

      add        r2,r2,#1                @ from++
      add        r3,r3,#1                @ out++
                                         @
      vld1.8     {d0},[r2]               @ load 8 bytes into d0

inflate_fast_copy_gap5to7b_proc_5to7bytes_loop:

      cmp        r0,#8                   @ a store writes 8 bytes: stop once
      blo        inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done
                                         @  fewer than 8 remain

      vst1.8     {d0},[r3],r4            @ store 8 bytes in out
                                         @ out+={5-7}

      sub        r0,r0,r4                @ len-={5-7}
      b          inflate_fast_copy_gap5to7b_proc_5to7bytes_loop

inflate_fast_copy_gap5to7b_proc_5to7bytes_loop_done:

      cmp        r0,#0                   @ only possible if entered with len == 0
      beq        inflate_fast_copy_gap5to7b_proc_update_out

      @; r2 still points at the start of the pattern and out has advanced
      @; by a multiple of the gap, so reading from r2 onward continues the
      @; pattern (bytes past the first ``gap'' were written by the first
      @; store).
inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop:

      ldrb       r12,[r2],#1             @ r12 = *from, from++
      subs       r0,r0,#1                @ len--
      strb       r12,[r3],#1             @ *out = r12, out++

      bne        inflate_fast_copy_gap5to7b_proc_lastfewbytes_loop

inflate_fast_copy_gap5to7b_proc_update_out:

      sub        r3,r3,#1                @ out--
      str        r3,[r1,#0]              @ *out = last byte written
      b          inflate_fast_copy_exit
357

    
358
      @; ------------------------------------------------------------------
      @; vectorized copy routine when gap between ``from'' and ``out''
      @;  buffers is 8 bytes: the 8 bytes at from+1 are replicated
      @; INPUTS:
      @;  r0 = len
      @;  r1 = pointer to pointer to out
      @;  r2 = pointer to from
      @;  r3 = pointer to out
      @; OUTPUTS:
      @;  [r1] = out + len
      @; SCRATCH:
      @;  r4 = 16-byte block counter, r12 = tail byte, q0 = pattern vector
      @; ------------------------------------------------------------------
inflate_fast_copy_gap8b_proc:

      add        r3,r3,#1              @ out++
      add        r2,r2,#1              @ from++
      vld1.8     {d0},[r2]             @ fetch the 8-byte pattern into d0
      vmov       d1,d0                 @ q0 = pattern twice

      lsrs       r4,r0,#4              @ r4 = len / 16, Z set if no full block
      and        r0,r0,#15             @ r0 = len % 16 (leaves flags alone)
      beq        inflate_fast_copy_gap8b_proc_16bytes_loop_done

inflate_fast_copy_gap8b_proc_16bytes_loop:

      vst1.8     {q0},[r3]!            @ write one 16-byte block, out += 16
      subs       r4,r4,#1              @ one block fewer to go
      bne        inflate_fast_copy_gap8b_proc_16bytes_loop

inflate_fast_copy_gap8b_proc_16bytes_loop_done:

      cmp        r0,#0                 @ any tail bytes left?
      beq        inflate_fast_copy_gap8b_proc_update_out

      @; r2 was not advanced by the block loop; 16 is a multiple of the
      @; pattern length, so reading from r2 onward continues the pattern.
inflate_fast_copy_gap8b_proc_lastfewbytes_loop:

      ldrb       r12,[r2],#1           @ r12 = *from, from++
      strb       r12,[r3],#1           @ *out = r12, out++
      subs       r0,r0,#1              @ len--
      bne        inflate_fast_copy_gap8b_proc_lastfewbytes_loop

inflate_fast_copy_gap8b_proc_update_out:

      sub        r3,r3,#1              @ undo the initial out++
      str        r3,[r1,#0]            @ *out = last byte written
      b          inflate_fast_copy_exit
409

    
410
      @; ------------------------------------------------------------------
      @; vectorized copy routine when gap between ``from'' and ``out''
      @;  buffers is {9-15} bytes
      @;
      @; 16 bytes are loaded from from+1: ``gap'' pattern bytes followed by
      @; 16-gap bytes that are not part of the pattern. Each store writes
      @; all 16 bytes but advances out by only ``gap'', so the next store
      @; (or the byte loop) overwrites the extra bytes.
      @;
      @; The vector loop only runs while at least 16 bytes remain, so a
      @; store never writes beyond out+len. It always leaves at least 16-gap
      @; bytes for the byte loop, which overwrites the extra bytes of the
      @; last store.
      @;
      @; INPUTS:
      @;  r0 = len
      @;  r1 = pointer to pointer to out
      @;  r2 = pointer to from
      @;  r3 = pointer to out
      @;  r4 = {9-15}
      @; OUTPUTS:
      @;  [r1] = out + len
      @; ------------------------------------------------------------------
inflate_fast_copy_gap9to15b_proc:

      add        r2,r2,#1            @ from++
      add        r3,r3,#1            @ out++
                                     @
      vld1.8     {q0},[r2]           @ load 16 bytes into q0

inflate_fast_copy_gap9to15b_proc_9to15bytes_loop:

      cmp        r0,#16              @ a store writes 16 bytes: stop once
      blo        inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done
                                     @  fewer than 16 remain

      vst1.8     {q0},[r3],r4        @ store 16 bytes in out
                                     @ out+={9-15}

      sub        r0,r0,r4            @ len-={9-15}
      b          inflate_fast_copy_gap9to15b_proc_9to15bytes_loop

inflate_fast_copy_gap9to15b_proc_9to15bytes_loop_done:

      cmp        r0,#0               @ only possible if entered with len == 0
      beq        inflate_fast_copy_gap9to15b_proc_update_out

      @; r2 still points at the start of the pattern and out has advanced
      @; by a multiple of the gap, so reading from r2 onward continues the
      @; pattern (bytes past the first ``gap'' were written by the first
      @; store).
inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop:

      ldrb       r12,[r2],#1         @ r12 = *from, from++
      subs       r0,r0,#1            @ len--
      strb       r12,[r3],#1         @ *out = r12, out++

      bne        inflate_fast_copy_gap9to15b_proc_lastfewbytes_loop

inflate_fast_copy_gap9to15b_proc_update_out:

      sub        r3,r3,#1            @ out--
      str        r3,[r1,#0]          @ *out = last byte written
      b          inflate_fast_copy_exit
459

    
460
     @; ------------------------------------------------------------------
     @; vectorized copy routine when gap between ``from'' and ``out''
     @;  buffers is 16 bytes or more: straight 16-byte block copy followed
     @;  by a byte loop for the remainder
     @; INPUTS:
     @;  r0 = len
     @;  r1 = pointer to pointer to out
     @;  r2 = pointer to from
     @;  r3 = pointer to out
     @; OUTPUTS:
     @;  [r1] = out + len
     @; SCRATCH:
     @;  r4 = 16-byte block counter, r12 = tail byte, q0 = block in flight
     @; Falls through into inflate_fast_copy_exit, the common epilogue.
     @; ------------------------------------------------------------------
inflate_fast_copy_gap16b_proc:

     add        r3,r3,#1             @ out++
     add        r2,r2,#1             @ from++

     lsrs       r4,r0,#4             @ r4 = len / 16, Z set if no full block
     and        r0,r0,#15            @ r0 = len % 16 (leaves flags alone)
     beq        inflate_fast_copy_gap16b_proc_16bytes_loop_done

inflate_fast_copy_gap16b_proc_16bytes_loop:

     vld1.8     {q0},[r2]!           @ q0 = next 16 source bytes, from += 16
     subs       r4,r4,#1             @ one block fewer to go
     vst1.8     {q0},[r3]!           @ write them, out += 16 (flags untouched)
     bne        inflate_fast_copy_gap16b_proc_16bytes_loop

inflate_fast_copy_gap16b_proc_16bytes_loop_done:

     cmp        r0,#0                @ any tail bytes left?
     beq        inflate_fast_copy_gap16b_proc_update_out

inflate_fast_copy_gap16b_proc_lastfewbytes_loop:

     ldrb       r12,[r2],#1          @ r12 = *from, from++
     strb       r12,[r3],#1          @ *out = r12, out++
     subs       r0,r0,#1             @ len--
     bne        inflate_fast_copy_gap16b_proc_lastfewbytes_loop

inflate_fast_copy_gap16b_proc_update_out:

     sub        r3,r3,#1             @ undo the initial out++
     str        r3,[r1,#0]           @ *out = last byte written

     @; ------------------------------------------------------------------
     @; common epilogue for every path of _inflate_fast_copy_neon
     @; ------------------------------------------------------------------
inflate_fast_copy_exit:

      pop        {r4-r11}            @ restore the registers pushed on entry
      bx         lr                  @ return
513

    
514
@.size inflate_fast_copy_neon,  .-inflate_fast_copy_neon
515

    
516

    
517
@        .END