-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
armgfx.s
2406 lines (2378 loc) · 88.1 KB
/
armgfx.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@ TITLE("larrys_graphics")
@ ARM asm code for SmartGear
@ Copyright (c) 2000-2017 BitBank Software, Inc.
@ Written by Larry Bank
@
@ Change Log
@ 12/5/2008 - added PLD instructions to speed things up on XScale and Qualcomm MSM7200
@ these are ignored on OMAP and Samsung CPUs
@ 1/24/2009 - changed stretchblt ratio to 65536 for more accuracy
@ 4/23/2009 - added support for Treo 800w. It has a pitch of 0x1000, but does
@ not have the "venetian blinds" mapping of the Motorola Q9h
@
@ This program is free software: you can redistribute it and/or modify
@ it under the terms of the GNU General Public License as published by
@ the Free Software Foundation, either version 3 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU General Public License for more details.
@
@ You should have received a copy of the GNU General Public License
@ along with this program. If not, see <http://www.gnu.org/licenses/>.
@
@ EXPORT ARMGFXCopy16
@ Symbols made visible to the linker so the routines can be called from C.
.global ARMDraw2X
.global SNESDrawSprite
.global ARMDrawTile
.global ARMDrawTileOpaque
.global ASMBLT150S
.global ASMBLT100
.global ASMBLT100_270
.global ASMBLT100_90
.global ASMBLT200HQ
.global ASMBLT200
.global ASMBLT200HQ_90
.global ASMBLT200_90
.global ASMBLT200_90a
.global ASMBLT200_90b
.global ASMBLT200_270
.global ASMBLT300
.global ASMBLT300HQ
.global ASMBLT300_90
.global ASMBLT300_270
.global ASMBLT150S_V6
.global ASMBLT75S
.global ASMBLT50S
.global ASMBLT150S270
.global ARMBLTFTW
.global ARMV6TEST
.global ARMPERFTEST
.global ARMDrawCoinOpSprite16
.global ARMDrawCoinOpTile16
.global G88DrawCharOpaque
@
@ call from C as void G88DrawCharOpaque(int sx,int sy,int iChar,int iColor,unsigned char *pData, int iPitch, unsigned char *pBitmap)
@ draw an 8x8 fully opaque character
@
@ Register arguments: r0 = sx, r1 = sy, r2 = iChar, r3 = iColor
@ Stack arguments (read after the 10-register push): pData, iPitch, pBitmap
@ Each character occupies 64 bytes of source data (8 rows of 8 one-byte pixels).
@ Every source byte has iColor added to it and is written as a 16-bit pixel.
@ The destination address is tested for 4-byte alignment once, before the
@ first row: aligned -> g88drawfast (one 16-byte store per row), otherwise
@ g88drawslow (eight halfword stores per row).
@
G88DrawCharOpaque:
stmfd sp!,{r4-r12,lr}
@ 10 registers were pushed, so the first stack argument is at sp+40
ldr r4,[sp,#40] @ source data
ldr r5,[sp,#44] @ pitch
ldr r6,[sp,#48] @ dest data
add r4,r4,r2,LSL #6 @ point to correct source image (64 * iChar)
mul r7,r1,r5 @ pDest += y * iPitch
add r6,r6,r0,LSL #1 @ pDest += x*2
add r6,r6,r7
mov r0,#0xff @ mask to extract pixels
mov r14,#8 @ number of lines to do
tst r6,#3 @ dword aligned address?
bne g88drawslow
@ fast path: build four words (2 pixels each) and store them with one STM
g88drawfast:
ldmia r4!,{r7,r8} @ read 8 source pixels
and r9,r0,r7 @ first pixel
and r10,r0,r7,LSR #8 @ second pixel
add r9,r9,r3 @ + color
add r10,r10,r3
and r11,r0,r7,LSR #16 @ third pixel
orr r9,r9,r10,LSL #16 @ combine first 2 pixels
and r10,r0,r7,LSR #24 @ fourth pixel
add r11,r11,r3 @ + color
add r10,r10,r3
orr r10,r11,r10,LSL #16 @ combine second 2 pixels
and r1,r0,r8 @ fifth pixel
and r2,r0,r8,LSR #8 @ sixth pixel
add r1,r1,r3 @ + color
add r2,r2,r3
and r12,r0,r8,LSR #16 @ seventh pixel
orr r11,r1,r2,LSL #16 @ combine pixels 5+6
and r1,r0,r8,LSR #24 @ eighth pixel
add r12,r12,r3 @ + color
add r1,r1,r3
orr r12,r12,r1,LSL #16 @ combine pixels 7+8
stmia r6,{r9-r12} @ store 8 pixels
add r6,r6,r5 @ pDest += iPitch
subs r14,r14,#1 @ while (--y)
bne g88drawfast
@ common exit (also reached from the slow path)
g88drawexit:
ldmfd sp!,{r4-r12,pc}
@ need to draw it 1 pixel at a time
g88drawslow:
ldmia r4!,{r7,r8} @ read 8 source pixels
and r9,r0,r7 @ first pixel
and r10,r0,r7,LSR #8 @ second pixel
add r9,r9,r3 @ + color
add r10,r10,r3
strh r9,[r6],#2
strh r10,[r6],#2
and r11,r0,r7,LSR #16 @ third pixel
and r10,r0,r7,LSR #24 @ fourth pixel
add r11,r11,r3 @ + color
add r10,r10,r3
strh r11,[r6],#2
strh r10,[r6],#2
and r1,r0,r8 @ fifth pixel
and r2,r0,r8,LSR #8 @ sixth pixel
add r1,r1,r3 @ + color
add r2,r2,r3
strh r1,[r6],#2
strh r2,[r6],#2
and r12,r0,r8,LSR #16 @ seventh pixel
and r1,r0,r8,LSR #24 @ eighth pixel
add r12,r12,r3 @ + color
add r1,r1,r3
strh r12,[r6],#2
strh r1,[r6],#2
add r6,r6,r5 @ pDest += iPitch
sub r6,r6,#16 @ undo the 8 post-incremented halfword stores
subs r14,r14,#1 @ while (--y)
bne g88drawslow
b g88drawexit
@
@ call from C as void ARMDrawCoinOpTile16(unsigned char *p, unsigned char *d, int xCount, int yCount, int iColor, int iCoinOpPitch, int iSize);
@
@ Copies an xCount by yCount block of 8-bit source pixels to a 16-bit
@ destination, adding iColor to every pixel (no transparency test).
@ r0 = p (source), r1 = d (destination), r2 = xCount, r3 = yCount
@ stack: iColor, iCoinOpPitch (destination bytes per line), iSize (source bytes per line)
@
ARMDrawCoinOpTile16:
stmdb sp!,{r4-r10} @ 7 registers = 28 bytes pushed
add r10,sp,#28 @ r10 -> first stack argument
ldmia r10,{r6-r8} @ r6 = iColor, r7 = iCoinOpPitch, r8 = iSize
sub r8,r8,r2 @ source skip per line = iSize - xCount
sub r7,r7,r2,LSL #1 @ dest skip per line = iCoinOpPitch - 2*xCount
cotile_row:
mov r10,r2 @ pixels left on this line
cotile_pixel:
ldrb r9,[r0],#1 @ c = *p++
add r9,r9,r6 @ c + color
strh r9,[r1],#2 @ always stored (opaque)
subs r10,r10,#1 @ while (--x)
bne cotile_pixel
@ step both pointers to the start of the next line
add r0,r0,r8
add r1,r1,r7
subs r3,r3,#1 @ while (--y)
bne cotile_row
ldmia sp!,{r4-r10}
bx lr
@
@ call from C as void ARMDrawCoinOpSprite16(unsigned char *p, unsigned char *d, int xCount, int yCount, unsigned long ulTransMask, int iColor, int iCoinOpPitch, int iSize);
@
@ Like the opaque tile copy, but a pixel value c is skipped when bit c of
@ ulTransMask is set.
@ r0 = p (source), r1 = d (destination), r2 = xCount, r3 = yCount
@ stack: ulTransMask, iColor, iCoinOpPitch (dest bytes per line), iSize (source bytes per line)
@
ARMDrawCoinOpSprite16:
stmdb sp!,{r5-r11} @ 7 registers = 28 bytes pushed
add r11,sp,#28 @ r11 -> first stack argument
ldmia r11,{r5-r8} @ r5 = ulTransMask, r6 = iColor, r7 = iCoinOpPitch, r8 = iSize
sub r8,r8,r2 @ source skip per line = iSize - xCount
sub r7,r7,r2,LSL #1 @ dest skip per line = iCoinOpPitch - 2*xCount
mov r11,#1 @ single bit used to probe the mask
cospr_row:
mov r10,r2 @ pixels left on this line
cospr_pixel:
ldrb r9,[r0],#1 @ c = *p++
tst r5,r11,LSL r9 @ Z set when bit c of the mask is clear
add r9,r9,r6 @ c + color
streqh r9,[r1],#2 @ if !(ulTransMask & 1<<c) store and advance
addne r1,r1,#2 @ transparent: just advance
subs r10,r10,#1 @ while (--x)
bne cospr_pixel
@ step both pointers to the start of the next line
add r0,r0,r8
add r1,r1,r7
subs r3,r3,#1 @ while (--y)
bne cospr_row
ldmia sp!,{r5-r11}
bx lr
@
@ Structure offsets for ScaleStruct
@ Byte offsets of consecutive 4-byte fields. ARMBLTFTW reads these fields
@ relative to the structure pointer it receives in r0.
@
.equ srcptr, 0 @ base address of the source bitmap
.equ srcpitch, 4 @ multiplied by the source line number to reach a source line
.equ srcwidth, 8 @ compared against 160 to pick the 90-degree method
.equ srcheight, 12
.equ destptr, 16 @ destination base used by the angle-0 path
.equ destpitch, 20 @ byte step between destination lines
.equ destwidth, 24
.equ destheight, 28
.equ scalex, 32 @ step added to the x accumulator per destination pixel
.equ scaley, 36 @ step added to the y accumulator per destination line
.equ destx, 40 @ destination X offset in pixels (angle-0 path)
.equ desty, 44 @ destination Y offset in lines (angle-0 path)
.equ leftdestptr, 48 @ for drawing in the "right" direction
@
@ ARMV6TEST - returns 1 in r0 when bit 8 or bit 11 of the CP15 main ID
@ register is set, otherwise 0. The ID register is only read when the low
@ four CPSR mode bits are all ones; in every other mode the result is 0.
@
ARMV6TEST:
mrs r0, cpsr
mvn r0, r0 @ invert so a mode nibble of 0b1111 becomes 0b0000
tst r0, #15 @ Z set only when CPSR[3:0] were all ones
mov r0, #0 @ default ID value when the register cannot be read
mrceq p15, 0, r0, c0, c0, 0 @ read main id cp15 register 0
ands r0,r0,#0x900 @ if either bit is set, we have ARMv6 instruction set
movne r0,#1 @ true (r0 is already 0 for false)
mov pc, lr
@
@ Call from C as ARMPERFTEST(s, d, len)
@ tested on Qualcomm M7200 with 1MB buffer and 20 iterations
@ ldmia all 8 registers at once yielded 362ms
@ ldmia 4 registers at a time yielded 349ms
@ adding PLD instruction dropped it to 225ms
@
@ r0 = s (source), r1 = d (destination), r2 = len in bytes
@ The active loop runs len/32 times; each pass reads one word from the
@ source and writes two words (r4 and whatever r5 holds on entry; r5 is
@ never loaded here) at a 32-byte stride in the destination.
@ A len below 32 gives a count of zero and returns without touching memory
@ (previously the count wrapped around and the loop ran past both buffers).
@
ARMPERFTEST:
stmfd sp!,{r4-r12,lr}
@ sub r2,r2,#4
movs r2,r2,LSR #5 @ number of 32-byte steps
beq perfexit @ nothing to do for len < 32
perfloop1:
ldr r4,[r0],#4
subs r2,r2,#1
@ str r4,[r1],#32
stmia r1,{r4,r5}
add r1,r1,#32
bne perfloop1
b perfexit
@ The code from here to perfexit is never reached from the code above
@ (alternate copy loop kept for experiments).
mov r2,r2,lsr #4 @ 16 bytes per iteration
perfloop0:
ldmia r0!,{r4,r5,r6,r7}
@ ldmia r0!,{r4-r11}
pld [r0,#0x40]
subs r2,r2,#1
stmia r1!,{r4,r5,r6,r7}
@ stmia r1!,{r4-r11}
@ ldmia r0!,{r8,r9,r10,r11}
@ stmia r1!,{r8,r9,r10,r11}
bne perfloop0
perfexit:
ldmia sp!,{r4-r12,pc}
@
@ stretch or shrink 16-bit pixels from a src bitmap to a dest
@ ARMBLTFTW (added 2/7/07) - scale an image larger or smaller (w/o pixel averaging)
@ call from C as ARMBLTFTW(&dftw, 270)@
@
@ r0 = pointer to a ScaleStruct (fields accessed through the srcptr ..
@      leftdestptr offsets), r1 = display angle
@ Angles 0, 90 and 270 are handled; any other value returns immediately.
@ Angle 0 treats scalex/scaley as 16.16 fixed-point steps (sum >> 16 is the
@ source coordinate). The rotated paths add the low 16 bits of the step into
@ the upper half of a register and use the carry to advance the source.
@ The bltftw_270a section is not reachable from the code in this routine
@ because the branch to it is commented out.
@
ARMBLTFTW:
stmfd sp!,{r4-r12,lr}
mov r2,#255
add r2,r2,#15 @ make 270
cmp r1,r2 @ display angle = 270?
beq bltftw_270
cmp r1,#90
beq bltftw_90
cmp r1,#0
ldmneia sp!,{r4-r12,pc} @ invalid angle, leave
@ angle 0 case
@ r2 = ysum, r12 = lines left, r14 = yscale, r10 = dest pitch
bltftw_0:
mov r2,#0 @ ysum
ldr r12,[r0,#destheight]
ldr r14,[r0,#scaley]
ldr r10,[r0,#destpitch]
bltftw_0_top:
mov r4,r2,LSR #16 @ source y
ldr r5,[r0,#destptr]
ldr r6,[r0,#srcptr]
ldr r1,[r0,#destheight]
ldr r7,[r0,#srcpitch]
ldr r11,[r0,#destx]
ldr r8,[r0,#desty]
mov r9,r1 @ save height to check if we're on a Treo 800w
sub r1,r1,r12 @ how far down we are (ysize-ycount)
add r5,r5,r11,LSL #1 @ add destination X offset
add r1,r1,r8 @ add destination Y offset
@ Compare against 240 and branch on "greater than": same condition as
@ height >= 241, but the taken branch always arrives with Z clear, so the
@ mulne at blt_notq9h is executed (with cmp #241 / bge a height of exactly
@ 241 arrived with Z set and the line offset was never computed).
cmp r9,#240 @ if >= 241, it's a treo 800w
bgt blt_notq9h
cmp r10,#0x1000 @ special case for Q9H
andeq r11,r1,#63 @ do special calc for dest address
addeq r5,r5,r11,LSL #12 @ in 64-line groups, pitch is 0x1000
moveq r11,r1,LSR #6 @ get the group #
moveq r1,#0x280 @ offset to each 64-line group
muleq r8,r1,r11
blt_notq9h:
mulne r8,r10,r1 @ linear case: offset = pitch * line
add r5,r5,r8 @ point to start of dest line
mul r8,r7,r4 @ point to start of source line
add r6,r6,r8
ldr r7,[r0,#scalex]
mov r4,#0 @ xsum
ldr r11,[r0,#destwidth]
bic r11,r11,#1 @ make sure it's an even number of destination pixels
bltftw_0_loop0: @ inner loop
@ do 4 dest pixels at a time for maximum
@ utilization of the ARM write buffers
mov r8,r4,LSR #15 @ scaled source pixel
bic r8,r8,#1 @ shorts
ldrh r1,[r6,r8] @ source pixel
add r4,r4,r7 @ xsum += xscale
mov r8,r4,LSR #15 @ scaled source pixel
bic r8,r8,#1 @ shorts
ldrh r9,[r6,r8] @ source pixel
add r4,r4,r7 @ xsum += xscale
orr r3,r1,r9,LSL #16 @ merge 2 pixels for better speed
mov r8,r4,LSR #15 @ scaled source pixel
bic r8,r8,#1 @ shorts
ldrh r1,[r6,r8] @ source pixel
add r4,r4,r7 @ xsum += xscale
mov r8,r4,LSR #15 @ scaled source pixel
bic r8,r8,#1 @ shorts
ldrh r9,[r6,r8] @ source pixel
add r4,r4,r7 @ xsum += xscale
subs r11,r11,#4 @ decrement destination count
orr r9,r1,r9,LSL #16 @ merge 2 pixels for better speed
stmia r5!,{r3,r9} @ store and advance pointer
@ bgt (not bne): the count is only forced even, so a width of 4k+2 steps
@ from 2 to -2 and would never hit zero exactly
bgt bltftw_0_loop0
@ advance y
add r2,r2,r14 @ ysum += yscale
subs r12,r12,#1 @ decrement y count
bne bltftw_0_top
b bltftw_done
@
@ Display rotated 90 case
@
bltftw_90:
ldr r2,[r0,#srcwidth]
cmp r2,#160
beq bltftw_90a @ other method is faster for small src
mov r2,#0 @ ysum
ldr r12,[r0,#destheight]
ldr r14,[r0,#scaley]
ldr r10,[r0,#destpitch]
@ each pass draws two adjacent destination pixels per store (two source lines)
bltftw_90_top:
mov r4,r2,LSR #16 @ source y
ldr r5,[r0,#leftdestptr]
ldr r6,[r0,#srcptr]
ldr r1,[r0,#destheight]
ldr r7,[r0,#srcpitch]
ldr r9,[r0,#srcheight]
sub r1,r1,r12 @ how far down we are (ysize-ycount)
add r5,r5,r1,LSL #1 @ point to start of dest line
sub r9,r9,#1
sub r4,r9,r4
mul r8,r7,r4 @ point to start of source line (start from bottom)
add r6,r6,r8
mov r9,r2,LSL #16
adds r9,r9,r14,LSL #16 @ see if the next line jumps on the src
mov r4,r6 @ second source
subcs r4,r4,r7 @ need to move up 1 line
ldr r7,[r0,#scalex]
ldr r11,[r0,#destwidth]
ldr r9,[r6],#4 @ get 2 source pixels
ldr r8,[r4],#4 @ and from "next" line
sub r11,r11,#1 @ since we draw on the 0th count
mov r3,#2 @ pixels available
bltftw_90_loop0: @ inner loop
adds r7,r7,r7,LSL #16 @ xsum += xscale
movcs r9,r9,LSR #16 @ get next pixel (lower)
movcs r8,r8,LSR #16 @ get next pixel (upper)
subcss r3,r3,#1
ldreq r9,[r6],#4 @ both pixels used up: fetch the next pair
ldreq r8,[r4],#4
moveq r3,#2
subs r11,r11,#1 @ decrement destination count
@ create a double pixel to store
mov r1,r8,LSL #16 @ "top"
mov r14,r9,LSL #16 @ "bottom"
orr r1,r1,r14,LSR #16 @ combine
str r1,[r5],r10
bne bltftw_90_loop0
@ advance y
ldr r14,[r0,#scaley] @ r14 was used as scratch in the inner loop
subs r12,r12,#2 @ decrement y count
add r2,r2,r14,LSL #1 @ ysum += yscale*2 (we drew 2 lines)
bgt bltftw_90_top
b bltftw_done
@ alternate 90-degree method, used when srcwidth is 160
bltftw_90a:
ldr r10,[r0,#scalex] @ src and delta
mov r12,#0 @ dest y
ldr r5,[r0,#srcheight]
ldr r6,[r0,#srcpitch]
sub r5,r5,#1
mul r7,r5,r6 @ start at "bottom left" of src
ldr r6,[r0,#srcptr]
add r3,r6,r7 @ now r3 points to bottom left of src
bltftw_90_topa:
mov r6,r3 @ source ptr
ldr r2,[r0,#destpitch]
ldr r5,[r0,#leftdestptr]
ldr r7,[r0,#srcpitch]
rsb r7,r7,#0 @ negative pitch
mul r8,r2,r12 @ point to start of dest line
add r5,r5,r8 @ R5 points to start of dest line
bic r5,r5,#3 @ we have to be on a DWORD boundary
ldr r8,[r0,#scaley]
ldr r11,[r0,#destheight]
adds r4,r10,r10,LSL #16 @ see if this line and the next will be the same
bcs bltftw_90_loop0a @ no, we have to touch that memory
ldr r4,[r0,#destwidth]
sub r4,r4,r12 @ see if we have at least 1 line left
cmp r4,#1
ble bltftw_90_loop0a @ no, last line
@ we can do it a little faster by drawing both
@ current and next line at the same time
add r2,r5,r2 @ point R2 to the next line
bltftw_90_loop1a:
ldrh r9,[r6] @ first source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
ldrh r14,[r6] @ second source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
orr r4,r9,r14,LSL #16 @ combine 2 pixels
ldrh r9,[r6] @ third source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
ldrh r14,[r6] @ fourth source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
pld [r6,r7] @ preload next line
subs r11,r11,#4 @ decrement destination count
orr r9,r9,r14,LSL #16 @ combine 2 pixels
stmia r5!,{r4,r9}
stmia r2!,{r4,r9} @ and the line below
bgt bltftw_90_loop1a
add r12,r12,#1 @ we did an extra line
adds r10,r10,r10,LSL #16 @ update y sum for the extra line
b bltftw_90n @ next line
@ have to draw a single line
bltftw_90_loop0a: @ inner loop
ldrh r9,[r6] @ first source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
ldrh r14,[r6] @ second source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
orr r4,r9,r14,LSL #16 @ combine 2 pixels
ldrh r9,[r6] @ third source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
ldrh r14,[r6] @ fourth source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
pld [r6,r7] @ preload next line
subs r11,r11,#4 @ decrement destination count
orr r9,r9,r14,LSL #16 @ combine 2 pixels
stmia r5!,{r4,r9}
bgt bltftw_90_loop0a
@ Advance fractional src x and whole dest y
bltftw_90n: @ next line
adds r10,r10,r10,LSL #16 @ src xsum += xscale
addcs r3,r3,#2 @ advance src 1 pixel "right"
ldr r4,[r0,#destwidth]
add r12,r12,#1 @ inc dest y
cmp r12,r4 @ done with whole image?
bne bltftw_90_topa
b bltftw_done
@
@ Display rotated 270 case
@
bltftw_270:
@ ldr r2,[r0,#srcwidth]
@ cmp r2,#160
@ beq bltftw_270a @ other method is faster for small src
mov r2,#0 @ ysum
ldr r12,[r0,#destheight]
ldr r14,[r0,#scaley]
ldr r10,[r0,#destpitch]
rsb r10,r10,#0 @ negate dest pitch
bltftw_270_top:
mov r4,r2,LSR #16 @ source y
ldr r5,[r0,#leftdestptr]
ldr r6,[r0,#srcptr]
ldr r1,[r0,#destheight]
ldr r7,[r0,#srcpitch]
ldr r9,[r0,#destwidth]
sub r1,r1,r12 @ how far down we are (ysize-ycount)
add r5,r5,r1,LSL #1 @ point to start of dest line
sub r9,r9,#1 @ start at bottom of dest
rsb r1,r10,#0 @ get positive dest pitch
mul r8,r9,r1
add r5,r5,r8 @ now r5 points to correct dest pixel
mul r8,r7,r4 @ point to start of source line (start from top)
add r6,r6,r8 @ now r6 points to correct src pixel
mov r9,r2,LSL #16
adds r9,r9,r14,LSL #16 @ see if the next line jumps on the src
mov r4,r6 @ second source
addcs r4,r4,r7 @ need to move down 1 line
ldr r7,[r0,#scalex]
ldr r11,[r0,#destwidth]
ldr r9,[r6],#4 @ get 2 source pixels
ldr r8,[r4],#4 @ and from "next" line
sub r11,r11,#1 @ since we draw on the 0th count
mov r3,#2 @ pixels available
bltftw_270_loop0: @ inner loop
adds r7,r7,r7,LSL #16 @ xsum += xscale
movcs r9,r9,LSR #16 @ get next pixel (upper)
movcs r8,r8,LSR #16 @ get next pixel (lower)
subcss r3,r3,#1
ldreq r9,[r6],#4 @ both pixels used up: fetch the next pair
ldreq r8,[r4],#4
moveq r3,#2
subs r11,r11,#1 @ decrement destination count
@ create a double pixel to store
mov r1,r8,LSL #16 @ "top"
mov r14,r9,LSL #16 @ "bottom"
orr r1,r1,r14,LSR #16 @ combine
str r1,[r5],r10 @ r10 is negative, so we work our way up the screen
bne bltftw_270_loop0
@ advance y
ldr r14,[r0,#scaley] @ r14 was used as scratch in the inner loop
subs r12,r12,#2 @ decrement y count
add r2,r2,r14,LSL #1 @ ysum += yscale*2 (we drew 2 lines)
bgt bltftw_270_top
b bltftw_done
@ alternate 270 method; the branch that would select it is commented out above
bltftw_270a:
ldr r10,[r0,#scalex] @ src and delta
mov r12,#0 @ dest y
ldr r5,[r0,#srcheight]
ldr r6,[r0,#srcpitch]
sub r5,r5,#1
mul r7,r5,r6 @ start at "bottom left" of src
ldr r6,[r0,#srcptr]
add r3,r6,r7 @ now r3 points to bottom left of src
bltftw_270_topa:
mov r6,r3 @ source ptr
ldr r2,[r0,#destpitch]
ldr r5,[r0,#leftdestptr]
ldr r7,[r0,#srcpitch]
rsb r7,r7,#0 @ negative pitch
mul r8,r2,r12 @ point to start of dest line
add r5,r5,r8 @ R5 points to start of dest line
bic r5,r5,#3 @ we have to be on a DWORD boundary
ldr r8,[r0,#scaley]
ldr r11,[r0,#destheight]
adds r4,r10,r10,LSL #16 @ see if this line and the next will be the same
bcs bltftw_270_loop0a @ no, we have to touch that memory
ldr r4,[r0,#destwidth]
sub r4,r4,r12 @ see if we have at least 1 line left
cmp r4,#1
ble bltftw_270_loop0a @ no, last line
@ we can do it a little faster by drawing both
@ current and next line at the same time
add r2,r5,r2 @ point R2 to the next line
bltftw_270_loop1a:
ldrh r9,[r6] @ first source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
ldrh r14,[r6] @ second source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
orr r4,r9,r14,LSL #16 @ combine 2 pixels
ldrh r9,[r6] @ third source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
ldrh r14,[r6] @ fourth source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
pld [r6,r7] @ preload next line
subs r11,r11,#4 @ decrement destination count
orr r9,r9,r14,LSL #16 @ combine 2 pixels
stmia r5!,{r4,r9}
stmia r2!,{r4,r9} @ and the line below
bgt bltftw_270_loop1a
add r12,r12,#1 @ we did an extra line
adds r10,r10,r10,LSL #16 @ update y sum for the extra line
b bltftw_270n @ next line
@ have to draw a single line
bltftw_270_loop0a: @ inner loop
ldrh r9,[r6] @ first source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
ldrh r14,[r6] @ second source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
orr r4,r9,r14,LSL #16 @ combine 2 pixels
ldrh r9,[r6] @ third source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
ldrh r14,[r6] @ fourth source pixel
adds r8,r8,r8,LSL #16 @ src y sum
addcs r6,r6,r7 @ "inc" src y
pld [r6,r7] @ preload next line
subs r11,r11,#4 @ decrement destination count
orr r9,r9,r14,LSL #16 @ combine 2 pixels
stmia r5!,{r4,r9}
bgt bltftw_270_loop0a
@ Advance fractional src x and whole dest y
bltftw_270n: @ next line
adds r10,r10,r10,LSL #16 @ src xsum += xscale
addcs r3,r3,#2 @ advance src 1 pixel "right"
ldr r4,[r0,#destwidth]
add r12,r12,#1 @ inc dest y
cmp r12,r4 @ done with whole image?
bne bltftw_270_topa
@ common exit for every angle
bltftw_done:
ldmia sp!,{r4-r12,pc}
@
@ Draw an image at 75% scale with pixel averaging
@ call from C as ASMBLT75S(s, sPitch, d, dPitch, Width, Height)@
@ 8x8 blocks of pixels turn into 6x6 blocks
@
@ r0 = s, r1 = sPitch, r2 = d, r3 = dPitch; Width and Height come from the stack.
@ Each outer pass consumes 4 source lines and produces 3 destination lines:
@ lines 1 and 2 are shrunk horizontally only, line 3 is the average of source
@ lines 3 and 4. Horizontally, every 8 source pixels become 6.
@ The loops end on an exact zero count (bne), so Width is expected to be a
@ multiple of 8 and Height a multiple of 4.
@ r14 holds 0x7bef7bef: applied after a 1-bit right shift it halves both
@ 16-bit pixels of a word field by field (presumably RGB565 - confirm).
@
ASMBLT75S:
stmfd sp!,{r4-r12,lr}
mov r14,#0xf700 @ prepare averaging mask
orr r14,r14,#0xde
orr r14,r14,r14,LSL #16
mov r14,r14,LSR #1 @ prepare for mask and add
ldr r9,[sp,#40] @ get the source width
ldr r11,[sp,#44] @ get height
mov r10,r9 @ keep width in r10
add r12,r9,r9,LSL #1 @ width*3
mov r12,r12,LSR #1 @ (width*3/4) * sizeof (short)
@ first line
blt75_top0:
ldmia r0!,{r5-r8} @ read 8 pixels (which become 6)
subs r10,r10,#8 @ line count
and r6,r14,r6,LSR #1 @ second pair gets averaged
add r6,r6,r6,LSR #16 @ average the 2 pixels together
mov r6,r6,LSL #16 @ blast upper 16 bits
mov r6,r6,LSR #16
orr r6,r6,r7,LSL #16 @ get 5th pixel unchanged
mov r7,r7,LSR #16 @ get 6th pixel in 5th position
and r8,r14,r8,LSR #1 @ average the 7th and 8th pixels
add r8,r8,r8,LSR #16
mov r8,r8,LSL #16 @ put it in upper pixel position
orr r7,r7,r8 @ now r7 has the new pixels 5 and 6
stmia r2!,{r5-r7} @ store the 6 new pixels
bne blt75_top0
add r0,r0,r1 @ point to next source line
sub r0,r0,r9,LSL #1 @ move back amount we read through
add r2,r2,r3 @ point to next dest line
sub r2,r2,r12 @ move back amount we wrote
@ second line
mov r10,r9 @ width
blt75_top1:
ldmia r0!,{r5-r8} @ read 8 pixels (which become 6)
subs r10,r10,#8 @ line count
and r6,r14,r6,LSR #1 @ second pair gets averaged
add r6,r6,r6,LSR #16 @ average the 2 pixels together
mov r6,r6,LSL #16 @ blast upper 16 bits
mov r6,r6,LSR #16
orr r6,r6,r7,LSL #16 @ get 5th pixel unchanged
mov r7,r7,LSR #16 @ get 6th pixel in 5th position
and r8,r14,r8,LSR #1 @ average the 7th and 8th pixels
add r8,r8,r8,LSR #16
mov r8,r8,LSL #16 @ put it in upper pixel position
orr r7,r7,r8 @ now r7 has the new pixels 5 and 6
stmia r2!,{r5-r7} @ store the 6 new pixels
bne blt75_top1
add r0,r0,r1 @ point to next source line
sub r0,r0,r9,LSL #1 @ move back amount we read through
add r2,r2,r3 @ point to next dest line
sub r2,r2,r12 @ move back amount we wrote
@ third line
@ (reads the current source line and the one below it and blends them)
mov r10,r9 @ width
blt75_top2:
subs r10,r10,#8 @ decrement x count
ldr r6,[r0,r1] @ first pair below
ldr r5,[r0],#4 @ first pair
and r6,r14,r6,LSR #1 @ halve the lower pair
and r5,r14,r5,LSR #1 @ halve the upper pair
add r4,r5,r6 @ average top pair to bottom pair
ldr r6,[r0,r1] @ second pair below
ldr r5,[r0],#4 @ second pair
and r6,r14,r6,LSR #1 @ average this pair together
add r6,r6,r6,LSR #16
and r5,r14,r5,LSR #1 @ and this pair
add r5,r5,r5,LSR #16
and r5,r14,r5,LSR #1 @ average again
and r6,r14,r6,LSR #1
add r5,r5,r6 @ now 4 pixels averaged
mov r5,r5,LSL #16 @ clear top 16 bits
mov r5,r5,LSR #16
ldr r7,[r0,r1] @ third pair below
ldr r6,[r0],#4 @ third pair
and r7,r14,r7,LSR #1 @ average top to bottom
and r6,r14,r6,LSR #1
add r6,r6,r7 @ top pair averaged to bottom pair
orr r5,r5,r6,LSL #16 @ combine this pair
ldr r8,[r0,r1] @ fourth pair below
ldr r7,[r0],#4 @ fourth pair
and r8,r14,r8,LSR #1 @ average left/right of each pair
and r7,r14,r7,LSR #1
add r7,r7,r7,LSR #16
add r8,r8,r8,LSR #16
and r7,r14,r7,LSR #1 @ average top to bottom
and r8,r14,r8,LSR #1
add r7,r7,r8
mov r6,r6,LSR #16 @ get fifth pixel in position
orr r6,r6,r7,LSL #16 @ combine last pair
stmia r2!,{r4-r6} @ store 3 pairs to dest
bne blt75_top2
sub r0,r0,r9,LSL #1 @ move back to start of this line
add r0,r0,r1,LSL #1 @ move down 2 source lines
add r2,r2,r3 @ move down to next dest line
sub r2,r2,r12 @ move back amount we wrote
mov r10,r9 @ keep width in r10
subs r11,r11,#4 @ decrement height
bne blt75_top0
ldmfd sp!,{r4-r12,pc} @ restore regs and return
@
@ Draw an image at 270 degrees rotated
@ call from C as ASMBLT100_270(s, SrcPitch, d, DstPitch, cx, cy)@
@
@ r0 = s, r1 = SrcPitch, r2 = d, r3 = DstPitch; cx and cy come from the stack.
@ Works on 2x2 pixel cells: the source is walked in column pairs from the
@ right edge to the left, each pair read top to bottom, and every column
@ becomes one destination line. Counts step by 2 and end on exact zero.
@
ASMBLT100_270:
stmdb sp!,{r4-r12,lr} @ 10 registers = 40 bytes pushed
add r10,sp,#40 @ r10 -> stack arguments
ldmia r10,{r12,r14} @ r12 = cx (width), r14 = cy (height)
mov r10,r1,LSL #1 @ source step: two lines down
sub r11,r3,r14
mov r11,r11,LSL #1 @ dest step after a column pair: 2*DstPitch - 2*cy
add r0,r0,r12,LSL #1
sub r0,r0,#4 @ r0 -> last 2 columns of the first line
blt270_column_pair:
mov r8,r14 @ lines left in this column pair
mov r9,r0 @ walking source pointer
blt270_row_pair:
ldr r5,[r9,r1] @ 2 pixels from the line below
ldr r4,[r9],r10 @ 2 pixels from this line, then skip down 2 lines
mov r6,r5,LSR #16
mov r6,r6,LSL #16 @ right pixel of lower line, kept in the upper half
orr r6,r6,r4,LSR #16 @ right pixel of upper line in the lower half
mov r7,r4,LSL #16
mov r7,r7,LSR #16 @ left pixel of upper line in the lower half
orr r7,r7,r5,LSL #16 @ left pixel of lower line in the upper half
str r6,[r2] @ right column -> current dest line
str r7,[r2,r3] @ left column -> next dest line
add r2,r2,#4
subs r8,r8,#2
bne blt270_row_pair
add r2,r2,r11 @ back to line start, then down 2 dest lines
sub r0,r0,#4 @ back 2 pixels in source
subs r12,r12,#2 @ horizontal count
bne blt270_column_pair
ldmia sp!,{r4-r12,pc} @ restore regs
@
@ Draw an image at 90 degrees rotated
@ call from C as ASMBLT100_90(s, SrcPitch, d, DstPitch, cx, cy)@
@
@ r0 = s, r1 = SrcPitch, r2 = d, r3 = DstPitch; cx and cy come from the stack.
@ Works on 2x2 pixel cells: the source is walked in column pairs from the
@ left edge to the right, each pair read bottom to top, and every column
@ becomes one destination line. Counts step by 2 and end on exact zero.
@
ASMBLT100_90:
stmdb sp!,{r4-r12,lr} @ 10 registers = 40 bytes pushed
add r10,sp,#40 @ r10 -> stack arguments
ldmia r10,{r12,r14} @ r12 = cx (width), r14 = cy (height)
sub r4,r14,#2
mla r0,r1,r4,r0 @ r0 -> second-to-last source line (bottom left cell)
mov r10,r1,LSL #1
rsb r10,r10,#0 @ source step: two lines up
sub r11,r3,r14
mov r11,r11,LSL #1 @ dest step after a column pair: 2*DstPitch - 2*cy
blt90_column_pair:
mov r8,r14 @ lines left in this column pair
mov r9,r0 @ walking source pointer
blt90_row_pair:
ldr r5,[r9,r1] @ 2 pixels from the line below
ldr r4,[r9],r10 @ 2 pixels from this line, then skip up 2 lines
mov r6,r5,LSL #16
mov r6,r6,LSR #16 @ left pixel of lower line in the lower half
orr r6,r6,r4,LSL #16 @ left pixel of upper line in the upper half
mov r7,r4,LSR #16
mov r7,r7,LSL #16 @ right pixel of upper line, kept in the upper half
orr r7,r7,r5,LSR #16 @ right pixel of lower line in the lower half
str r6,[r2] @ left column -> current dest line
str r7,[r2,r3] @ right column -> next dest line
add r2,r2,#4
subs r8,r8,#2
bne blt90_row_pair
add r2,r2,r11 @ back to line start, then down 2 dest lines
add r0,r0,#4 @ advance 2 pixels in source
subs r12,r12,#2 @ horizontal count
bne blt90_column_pair
ldmia sp!,{r4-r12,pc} @ restore regs
@
@ Draw one line of tile data (opaque)
@ call from C as ARMDrawTileOpaque(unsigned long ulFlipColor, unsigned long ulLeft, unsigned long ulRight, unsigned char *pDest)
@
@ r0 = ulFlipColor: bit 0x4000 selects horizontal flip, the low 8 bits are the
@      color that is ORed into every pixel byte
@ r1 = ulLeft (4 one-byte pixels), r2 = ulRight (4 one-byte pixels), r3 = pDest
@ All 8 bytes are always written. Without flip the store width is chosen from
@ the alignment of pDest (word, halfword or byte); with flip the bytes are
@ written one at a time in reverse order.
@
ARMDrawTileOpaque:
tst r0,#0x4000 @ flipx?
and r0,r0,#0xff @ isolate color value
orr r0,r0,r0,LSL #8
orr r0,r0,r0,LSL #16 @ put color in all 4 bytes
orr r1,r1,r0 @ add color to pixels
orr r2,r2,r0
bne tile_opaque_flipx @ flags still come from the tst above
tst r3,#3 @ dword-aligned destination?
stmeqia r3!,{r1,r2} @ store all 8 pixels
moveq pc,lr @ leave
@ store a pair of pixels at a time
tst r3,#1 @ if it's an even address, we can use that
bne tile_opaque_bad @ worst case
strh r1,[r3] @ first pair of pixels
mov r1,r1,LSR #16
strh r1,[r3,#2] @ second pair
strh r2,[r3,#4] @ third pair
mov r2,r2,LSR #16
strh r2,[r3,#6] @ fourth pair
mov pc,lr
@ worst case
tile_opaque_bad:
strb r1,[r3]
mov r1,r1,LSR #8
strb r1,[r3,#1]
mov r1,r1,LSR #8
strb r1,[r3,#2]
mov r1,r1,LSR #8
strb r1,[r3,#3]
strb r2,[r3,#4]
mov r2,r2,LSR #8
strb r2,[r3,#5]
mov r2,r2,LSR #8
strb r2,[r3,#6]
mov r2,r2,LSR #8
strb r2,[r3,#7]
mov pc,lr
@ also a worst case (horizontally flipped)
@ ulRight fills bytes 3..0 and ulLeft fills bytes 7..4
tile_opaque_flipx:
strb r2,[r3,#3]
mov r2,r2,LSR #8
strb r2,[r3,#2]
mov r2,r2,LSR #8
strb r2,[r3,#1]
mov r2,r2,LSR #8
strb r2,[r3,#0]
strb r1,[r3,#7]
mov r1,r1,LSR #8
strb r1,[r3,#6]
mov r1,r1,LSR #8
strb r1,[r3,#5]
mov r1,r1,LSR #8
strb r1,[r3,#4]
mov pc,lr
@
@ Draw one line of tile data
@ call from C as ARMDrawTile(unsigned long ulFlipColor, unsigned long ulLeft, unsigned long ulRight, unsigned char *pDest)
@
@ r0 = ulFlipColor: bit 0x4000 selects horizontal flip, the low 8 bits are the
@      color that is ORed into each pixel that gets drawn
@ r1 = ulLeft (4 one-byte pixels), r2 = ulRight (4 one-byte pixels), r3 = pDest
@ A pixel byte of zero is transparent and leaves the destination byte untouched.
@ A group of 4 pixels that is entirely zero is skipped with a single test.
@
ARMDrawTile:
tst r0,#0x4000 @ flipx?
and r0,r0,#0xff @ isolate color value
bne tile_flipx @ flags still come from the tst above
orrs r1,r1,r1 @ first 4 pixels not transparent?
beq tile_d0 @ skip these 4
tst r1,#0xff @ anything to draw?
orrne r12,r0,r1 @ prepare first pixel
strneb r12,[r3]
tst r1,#0xff00 @ second pixel?
orrne r12,r0,r1,LSR #8
strneb r12,[r3,#1]
tst r1,#0xff0000 @ third pixel?
orrne r12,r0,r1,LSR #16
strneb r12,[r3,#2]
tst r1,#0xff000000 @ fourth pixel?
orrne r1,r0,r1,LSR #24
strneb r1,[r3,#3]
tile_d0:
orrs r2,r2,r2 @ second 4 pixels not transparent?
moveq pc,lr @ time to go
tst r2,#0xff @ anything to draw?
orrne r12,r0,r2 @ prepare first pixel
strneb r12,[r3,#4]
tst r2,#0xff00 @ second pixel?
orrne r12,r0,r2,LSR #8
strneb r12,[r3,#5]
tst r2,#0xff0000 @ third pixel?
orrne r12,r0,r2,LSR #16
strneb r12,[r3,#6]
tst r2,#0xff000000 @ fourth pixel?
orrne r12,r0,r2,LSR #24
strneb r12,[r3,#7]
mov pc,lr
@ horizontally flipped: ulRight fills bytes 3..0, ulLeft fills bytes 7..4
tile_flipx:
orrs r2,r2,r2 @ right-hand 4 pixels not transparent?
beq tile_fd0
tst r2,#0xff @ anything to draw?
orrne r12,r0,r2 @ prepare first pixel
strneb r12,[r3,#3]
tst r2,#0xff00
orrne r12,r0,r2,LSR #8
strneb r12,[r3,#2]
tst r2,#0xff0000
orrne r12,r0,r2,LSR #16
strneb r12,[r3,#1]
tst r2,#0xff000000
orrne r12,r0,r2,LSR #24
strneb r12,[r3,#0]
tile_fd0:
orrs r1,r1,r1 @ left-hand 4 pixels not transparent?
moveq pc,lr @ we can return here
tst r1,#0xff @ anything to draw?
orrne r12,r0,r1 @ prepare first pixel
strneb r12,[r3,#7]
tst r1,#0xff00
orrne r12,r0,r1,LSR #8
strneb r12,[r3,#6]
tst r1,#0xff0000
orrne r12,r0,r1,LSR #16
strneb r12,[r3,#5]
tst r1,#0xff000000
orrne r12,r0,r1,LSR #24
strneb r12,[r3,#4]
mov pc,lr
@
@ Draw one line of sprite data
@ call from C as SNESDrawSprite(psrc, pdest, color, flipx)
@
@ r0 = psrc (8 one-byte source pixels, read as two words), r1 = pdest,
@ r2 = color, r3 = flipx (non-zero draws the 8 pixels in reverse order)
@ The color is ORed into a whole word of 4 pixels at once, so r2 presumably
@ carries the color in every byte - confirm against the caller.
@ A pixel is drawn only when the low 4 bits of its (colored) byte are
@ non-zero; a 4-pixel word that is entirely zero is skipped outright.
@
SNESDrawSprite:
orrs r3,r3,r3 @ flipx?
bne sprite_flipx
mov r3,r1 @ get dest addr in r3
ldmia r0,{r0,r1} @ get 8 source pixels
orrs r0,r0,r0 @ first 4 pixels not transparent?
beq sprite_d0
orr r0,r0,r2 @ add in the color
tst r0,#0xf
strneb r0,[r3,#0]
mov r0,r0,LSR #8
tst r0,#0xf
strneb r0,[r3,#1]
mov r0,r0,LSR #8
tst r0,#0xf
strneb r0,[r3,#2]
mov r0,r0,LSR #8
tst r0,#0xf
strneb r0,[r3,#3]
sprite_d0:
orrs r1,r1,r1 @ second 4 pixels not transparent?
beq sprite_d1
orr r1,r1,r2 @ add in the color
tst r1,#0xf
strneb r1,[r3,#4]
mov r1,r1,LSR #8
tst r1,#0xf
strneb r1,[r3,#5]
mov r1,r1,LSR #8
tst r1,#0xf
strneb r1,[r3,#6]
mov r1,r1,LSR #8
tst r1,#0xf
strneb r1,[r3,#7]
sprite_d1:
mov pc,lr
@ horizontally flipped: second source word fills bytes 3..0, first fills 7..4
sprite_flipx:
mov r3,r1 @ get dest addr in r3
ldmia r0,{r0,r1} @ get 8 source pixels
orrs r1,r1,r1 @ second 4 pixels not transparent?
beq sprite_fd0
orr r1,r1,r2 @ add in the color
tst r1,#0xf
strneb r1,[r3,#3]
mov r1,r1,LSR #8
tst r1,#0xf
strneb r1,[r3,#2]
mov r1,r1,LSR #8
tst r1,#0xf
strneb r1,[r3,#1]
mov r1,r1,LSR #8
tst r1,#0xf
strneb r1,[r3,#0]
sprite_fd0:
orrs r0,r0,r0 @ first 4 pixels not transparent?
beq sprite_fd1
orr r0,r0,r2 @ add in the color
tst r0,#0xf
strneb r0,[r3,#7]
mov r0,r0,LSR #8
tst r0,#0xf
strneb r0,[r3,#6]
mov r0,r0,LSR #8
tst r0,#0xf
strneb r0,[r3,#5]
mov r0,r0,LSR #8
tst r0,#0xf
strneb r0,[r3,#4]
sprite_fd1:
mov pc,lr
@
@ Stretch a 16bpp bitmap 2x
@ call from C as ARMDraw2X(unsigned short *s, unsigned short *d, int iSrcPitch, int iDstPitch, int iWidth, int iHeight)@
@
ARMDraw2X:
stmfd sp!,{r4-r12,lr}
ldr r9,[sp,#40] @ get the source width
ldr r11,[sp,#44] @ get height
draw2x_top:
mov r10,r9,LSR #1 @ keep width in r10
@ draw the same line twice to prevent
@ cache misses on the writes