mirror of
https://salsa.debian.org/xorg-team/lib/pixman
synced 2025-08-29 06:03:17 +00:00
MIPS: DSPr2: Added over_n_8888_8888_ca and over_n_8888_0565_ca fast paths.
Performance numbers before/after on MIPS-74kc @ 1GHz

Reference (before):

lowlevel-blt-bench:
    over_n_8888_8888_ca = L1: 8.32 L2: 7.65 M: 6.38 ( 51.08%) HT: 5.78 VT: 5.74 R: 5.84 RT: 4.39 ( 37Kops/s)
    over_n_8888_0565_ca = L1: 7.40 L2: 6.95 M: 6.16 ( 41.06%) HT: 5.72 VT: 5.52 R: 5.63 RT: 4.28 ( 36Kops/s)

cairo-perf-trace:
    [ # ] backend test min(s) median(s) stddev. count
    [ # ] image: pixman 0.25.3
    [ 0] image xfce4-terminal-a1 138.223 139.070 0.33% 6/6
    [ # ] image16: pixman 0.25.3
    [ 0] image16 xfce4-terminal-a1 132.763 132.939 0.06% 5/6

Optimized (after):

lowlevel-blt-bench:
    over_n_8888_8888_ca = L1: 19.35 L2: 23.84 M: 13.68 (109.39%) HT: 11.39 VT: 11.19 R: 11.27 RT: 6.90 ( 47Kops/s)
    over_n_8888_0565_ca = L1: 18.68 L2: 17.00 M: 12.56 ( 83.70%) HT: 10.72 VT: 10.45 R: 10.43 RT: 5.79 ( 43Kops/s)

cairo-perf-trace:
    [ # ] backend test min(s) median(s) stddev. count
    [ # ] image: pixman 0.25.3
    [ 0] image xfce4-terminal-a1 130.400 131.720 0.46% 6/6
    [ # ] image16: pixman 0.25.3
    [ 0] image16 xfce4-terminal-a1 125.830 126.604 0.34% 6/6
This commit is contained in:
parent
a069da6c66
commit
d2ee5631ae
@ -308,3 +308,222 @@ LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips)
|
||||
nop
|
||||
|
||||
END(pixman_composite_src_x888_8888_asm_mips)
|
||||
|
||||
LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
/*
 * Component-alpha OVER of a solid source, through an a8r8g8b8 mask, onto
 * one scanline of an a8r8g8b8 destination.  Per channel:
 *
 *     dst = src * mask + dst * ~(mask * srca)      (saturating add)
 *
 * a0 - dst  (a8r8g8b8)
 * a1 - src  (32bit constant)
 * a2 - mask (a8r8g8b8)
 * a3 - w
 *
 * Register roles:
 *     t6 = 0xff, t7 = 0xffffffff, t8 = srca, t9 = 0x00ff00ff (maskLSR)
 *
 * Layout:
 *     1: two pixels per iteration, srca != 0xff
 *     2: two pixels per iteration, srca == 0xff
 *     3: trailing single pixel (also the whole job when w == 1)
 *
 * Pixel pairs whose masks are both zero are skipped; pairs whose masks are
 * both 0xffffffff take a shortcut that skips the src*mask and mask*srca
 * multiplies.
 *
 * On the shortcut paths the "mask * srca" operand must be srca in all four
 * channels, so it is built with replv.qb (replicate low byte).  A plain
 * copy of t8 would leave the upper channels of the inverted alpha wrong.
 * replv.qb only reads the low byte of t8, so it still works after the
 * MIPS_*_MUL_*UN8 macros have rewritten t8 in place with replv.ph.
 */

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3, s4, s5, s6, s7
    beqz         a3, 4f
     nop
    li           t6, 0xff
    addiu        t7, zero, -1   /* t7 = 0xffffffff */
    srl          t8, a1, 24     /* t8 = srca */
    li           t9, 0x00ff00ff
    addiu        t1, a3, -1
    beqz         t1, 3f         /* last pixel */
     nop
    beq          t8, t6, 2f     /* if (srca == 0xff) */
     nop
1:
                                /* a1 = src */
    lw           t0, 0(a2)      /* t0 = mask */
    lw           t1, 4(a2)      /* t1 = mask */
    or           t2, t0, t1
    beqz         t2, 12f        /* if (t0 == 0) && (t1 == 0) */
     addiu       a2, a2, 8
    and          t3, t0, t1
    replv.qb     s0, t8         /* s0 = srca in every channel */
    replv.qb     s1, t8         /* s1 = srca in every channel */
    move         t4, a1         /* t4 = src */
    move         t5, a1         /* t5 = src */
    lw           t2, 0(a0)      /* t2 = dst */
    beq          t3, t7, 11f    /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
     lw          t3, 4(a0)      /* t3 = dst */
    /* t4, t5 = src * mask */
    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
    /* s0, s1 = mask * srca */
    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, s0, s1, t9, s2, s3, s4, s5, s6, s7
11:
    not          s0, s0
    not          s1, s1
    /* s2, s3 = dst * ~(mask * srca) */
    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, s0, s1, s2, s3, t9, t0, t1, s4, s5, s6, s7
    addu_s.qb    t0, t4, s2
    addu_s.qb    t1, t5, s3
    sw           t0, 0(a0)
    sw           t1, 4(a0)
12:
    addiu        a3, a3, -2
    addiu        t1, a3, -1
    bgtz         t1, 1b         /* loop while at least two pixels remain */
     addiu       a0, a0, 8
    b            3f
     nop
2:
                                /* a1 = src */
    lw           t0, 0(a2)      /* t0 = mask */
    lw           t1, 4(a2)      /* t1 = mask */
    or           t2, t0, t1
    beqz         t2, 22f        /* if (t0 == 0) && (t1 == 0) */
     addiu       a2, a2, 8
    and          t2, t0, t1
    move         s0, a1
    beq          t2, t7, 21f    /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
     move        s1, a1
    lw           t2, 0(a0)      /* t2 = dst */
    lw           t3, 4(a0)      /* t3 = dst */
    /* t4, t5 = src * mask */
    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
    not          t0, t0         /* srca == 0xff, so mask * srca == mask */
    not          t1, t1
    /* s0, s1 = dst * ~mask */
    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, s0, s1, t9, s2, s3, s4, s5, s6, s7
    addu_s.qb    s0, t4, s0
    addu_s.qb    s1, t5, s1
21:
    sw           s0, 0(a0)
    sw           s1, 4(a0)
22:
    addiu        a3, a3, -2
    addiu        t1, a3, -1
    bgtz         t1, 2b         /* loop while at least two pixels remain */
     addiu       a0, a0, 8
3:
    blez         a3, 4f
     nop
                                /* a1 = src */
    lw           t1, 0(a2)      /* t1 = mask */
    beqz         t1, 4f
     nop
    replv.qb     s0, t8         /* s0 = srca in every channel */
    move         t2, a1         /* t2 = src */
    beq          t1, t7, 31f    /* if (t1 == 0xffffffff) */
     lw          t0, 0(a0)      /* t0 = dst */

    /* t2 = src * mask */
    MIPS_UN8x4_MUL_UN8x4 a1, t1, t2, t9, t3, t4, t5, t6
    /* s0 = mask * srca */
    MIPS_UN8x4_MUL_UN8   t1, t8, s0, t9, t3, t4, t5
31:
    not          s0, s0
    /* t3 = dst * ~(mask * srca) */
    MIPS_UN8x4_MUL_UN8x4 t0, s0, t3, t9, t4, t5, t6, t1
    addu_s.qb    t0, t2, t3
    sw           t0, 0(a0)
4:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3, s4, s5, s6, s7
    j            ra
     nop

END(pixman_composite_over_n_8888_8888_ca_asm_mips)
|
||||
|
||||
LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
/*
 * Component-alpha OVER of a solid source, through an a8r8g8b8 mask, onto
 * one scanline of an r5g6b5 destination.  Each destination pixel is
 * expanded to 8888, blended per channel as
 *
 *     dst = src * mask + dst * ~(mask * srca)      (saturating add)
 *
 * and packed back to 0565.
 *
 * a0 - dst  (r5g6b5)
 * a1 - src  (32bit constant)
 * a2 - mask (a8r8g8b8)
 * a3 - w
 *
 * Register roles:
 *     t5 = 0xf800f800, t6 = 0x07e007e0, t7 = 0x001F001F (565 channel masks)
 *     t8 = srca, t9 = 0x00ff00ff (maskLSR)
 *     s0 = 0xff, s1 = 0xffffffff (only set when w > 1; loops 1: and 2:
 *     must not use them as scratch)
 *
 * Layout:
 *     1: two pixels per iteration, srca != 0xff
 *     2: two pixels per iteration, srca == 0xff
 *     3: trailing single pixel (also the whole job when w == 1)
 *
 * In loop 1 the mask registers t0/t1 must stay intact until the general
 * path has consumed them.  The shortcut for two 0xffffffff masks therefore
 * sets up its operands (src in s2/s3, replicated srca in t0/t1) only after
 * the branch decision.  The 0xffffffff constant in s1 is kept out of every
 * scratch list so the comparison stays valid on later iterations.
 */

    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
    beqz         a3, 4f
     nop
    li           t5, 0xf800f800
    li           t6, 0x07e007e0
    li           t7, 0x001F001F
    li           t9, 0x00ff00ff

    srl          t8, a1, 24     /* t8 = srca */
    addiu        t1, a3, -1
    beqz         t1, 3f         /* last pixel */
     nop
    li           s0, 0xff       /* s0 = 0xff */
    addiu        s1, zero, -1   /* s1 = 0xffffffff */

    beq          t8, s0, 2f     /* if (srca == 0xff) */
     nop
1:
                                /* a1 = src */
    lw           t0, 0(a2)      /* t0 = mask */
    lw           t1, 4(a2)      /* t1 = mask */
    or           t2, t0, t1
    beqz         t2, 12f        /* if (t0 == 0) && (t1 == 0) */
     addiu       a2, a2, 8
    and          t3, t0, t1
    move         s2, a1         /* s2 = src (final value on the shortcut path) */
    move         s3, a1         /* s3 = src (final value on the shortcut path) */
    lhu          t2, 0(a0)      /* t2 = dst */
    bne          t3, s1, 10f    /* general path unless both masks are 0xffffffff */
     lhu         t3, 2(a0)      /* t3 = dst */
    /* both masks opaque: src * mask == src, mask * srca == srca */
    replv.qb     t0, t8         /* t0 = srca in every channel */
    b            11f
     replv.qb    t1, t8         /* t1 = srca in every channel */
10:
    /* s2, s3 = src * mask */
    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
    /* t0, t1 = mask * srca */
    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, t4, s4, s5, s6, s7, s8
11:
    not          t0, t0
    not          t1, t1
    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
    /* s4, s5 = dst * ~(mask * srca) */
    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t0, t1
    addu_s.qb    s2, s2, s4
    addu_s.qb    s3, s3, s5
    /* scratch registers must not include s1 (0xffffffff constant) */
    CONVERT_2x8888_TO_2x0565 s2, s3, t2, t3, t5, t6, t7, s6, s7
    sh           t2, 0(a0)
    sh           t3, 2(a0)
12:
    addiu        a3, a3, -2
    addiu        t1, a3, -1
    bgtz         t1, 1b         /* loop while at least two pixels remain */
     addiu       a0, a0, 4
    b            3f
     nop
2:
                                /* a1 = src */
    lw           t0, 0(a2)      /* t0 = mask */
    lw           t1, 4(a2)      /* t1 = mask */
    or           t2, t0, t1
    beqz         t2, 22f        /* if (t0 == 0) && (t1 == 0) */
     addiu       a2, a2, 8
    and          t3, t0, t1
    move         t2, a1
    beq          t3, s1, 21f    /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
     move        t3, a1
    lhu          t2, 0(a0)      /* t2 = dst */
    lhu          t3, 2(a0)      /* t3 = dst */
    /* s2, s3 = src * mask */
    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
    not          t0, t0         /* srca == 0xff, so mask * srca == mask */
    not          t1, t1
    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
    /* s4, s5 = dst * ~mask */
    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t2, t3
    addu_s.qb    t2, s2, s4
    addu_s.qb    t3, s3, s5
21:
    CONVERT_2x8888_TO_2x0565 t2, t3, t0, t1, t5, t6, t7, s2, s3
    sh           t0, 0(a0)
    sh           t1, 2(a0)
22:
    addiu        a3, a3, -2
    addiu        t1, a3, -1
    bgtz         t1, 2b         /* loop while at least two pixels remain */
     addiu       a0, a0, 4
3:
    blez         a3, 4f
     nop
                                /* a1 = src */
    lw           t1, 0(a2)      /* t1 = mask */
    beqz         t1, 4f
     nop
    replv.qb     s0, t8         /* s0 = srca in every channel */
    move         t2, a1         /* t2 = src */
    /*
     * s1 is not initialised when w == 1 and t7 holds a 565 channel mask
     * in this routine, so build the 0xffffffff constant locally.
     */
    addiu        t3, zero, -1   /* t3 = 0xffffffff */
    beq          t1, t3, 31f    /* if (t1 == 0xffffffff) */
     lhu         t0, 0(a0)      /* t0 = dst */

    /*
     * t2 = src * mask.  t5/t6 (565 channel masks) are used as scratch from
     * here on; they are not passed to any macro after this point.
     */
    MIPS_UN8x4_MUL_UN8x4 a1, t1, t2, t9, t3, t4, t5, t6
    /* s0 = mask * srca */
    MIPS_UN8x4_MUL_UN8   t1, t8, s0, t9, t3, t4, t5
31:
    not          s0, s0
    CONVERT_1x0565_TO_1x8888 t0, s1, s2, s3
    /* t3 = dst * ~(mask * srca) */
    MIPS_UN8x4_MUL_UN8x4 s1, s0, t3, t9, t4, t5, t6, t1
    addu_s.qb    t0, t2, t3
    CONVERT_1x8888_TO_1x0565 t0, s1, s2, s3
    sh           s1, 0(a0)
4:
    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
    j            ra
     nop

END(pixman_composite_over_n_8888_0565_ca_asm_mips)
|
||||
|
@ -95,6 +95,170 @@ LEAF_MIPS32R2(symbol) \
|
||||
.end function; \
|
||||
.size function,.-function
|
||||
|
||||
/*
 * Assemble-time guard for the register save/restore macros.
 *
 * Saving regs_num registers needs regs_num * 4 bytes.  The MIPS ABI lets a
 * function use the first 16 bytes of the caller's stack frame (the home
 * area for the arguments already held in a0-a3), so only the remainder,
 * regs_num * 4 - 16 bytes, has to be covered by stack_offset.  Assembly
 * stops with an error when stack_offset is smaller than that.
 */
.macro CHECK_STACK_OFFSET regs_num, stack_offset
    .if (\regs_num * 4 - 16) > \stack_offset
    .error "Stack offset too small."
    .endif
.endm
|
||||
|
||||
/*
 * Saves a set of registers on the stack.
 *
 * stack_offset - number of bytes subtracted from the stack pointer (sp)
 *                before the registers are stored.  It must be
 *                non-negative, a multiple of 4, and large enough for the
 *                register count as checked by CHECK_STACK_OFFSET (the
 *                first four registers need no extra room).
 * r1 .. r14    - registers to save, at most 14.  r1 is mandatory; the
 *                remaining parameters default to 0, which means "absent".
 *                Register rN is stored at 4 * (N - 1) bytes above the new sp.
 *
 * Always pair this with RESTORE_REGS_FROM_STACK, using the same
 * stack_offset and the same register list.  Example:
 *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
 *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
 */
.macro SAVE_REGS_ON_STACK      stack_offset = 0, r1, \
                               r2  = 0, r3  = 0, r4  = 0, \
                               r5  = 0, r6  = 0, r7  = 0, \
                               r8  = 0, r9  = 0, r10 = 0, \
                               r11 = 0, r12 = 0, r13 = 0, \
                               r14 = 0
    /* second term is stack_offset modulo 4 */
    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
    .error "Stack offset must be non-negative and a multiple of 4."
    .endif
    .if \stack_offset != 0
    addiu           sp, sp, -\stack_offset
    .endif
    sw              \r1, 0(sp)
    .if \r2 != 0
    sw              \r2, 4(sp)
    .endif
    .if \r3 != 0
    sw              \r3, 8(sp)
    .endif
    .if \r4 != 0
    sw              \r4, 12(sp)
    .endif
    /* from the fifth register on, the frame size has to be verified */
    .if \r5 != 0
    CHECK_STACK_OFFSET 5, \stack_offset
    sw              \r5, 16(sp)
    .endif
    .if \r6 != 0
    CHECK_STACK_OFFSET 6, \stack_offset
    sw              \r6, 20(sp)
    .endif
    .if \r7 != 0
    CHECK_STACK_OFFSET 7, \stack_offset
    sw              \r7, 24(sp)
    .endif
    .if \r8 != 0
    CHECK_STACK_OFFSET 8, \stack_offset
    sw              \r8, 28(sp)
    .endif
    .if \r9 != 0
    CHECK_STACK_OFFSET 9, \stack_offset
    sw              \r9, 32(sp)
    .endif
    .if \r10 != 0
    CHECK_STACK_OFFSET 10, \stack_offset
    sw              \r10, 36(sp)
    .endif
    .if \r11 != 0
    CHECK_STACK_OFFSET 11, \stack_offset
    sw              \r11, 40(sp)
    .endif
    .if \r12 != 0
    CHECK_STACK_OFFSET 12, \stack_offset
    sw              \r12, 44(sp)
    .endif
    .if \r13 != 0
    CHECK_STACK_OFFSET 13, \stack_offset
    sw              \r13, 48(sp)
    .endif
    .if \r14 != 0
    CHECK_STACK_OFFSET 14, \stack_offset
    sw              \r14, 52(sp)
    .endif
.endm
|
||||
|
||||
/*
 * Restores a set of registers from the stack.
 *
 * stack_offset - number of bytes added back to the stack pointer (sp)
 *                after the registers have been reloaded.  It must be
 *                non-negative, a multiple of 4, and large enough for the
 *                register count as checked by CHECK_STACK_OFFSET.
 * r1 .. r14    - registers to restore, at most 14.  r1 is mandatory; the
 *                remaining parameters default to 0, which means "absent".
 *                Register rN is loaded from 4 * (N - 1) bytes above sp.
 *
 * This is the counterpart of SAVE_REGS_ON_STACK and must be invoked with
 * the same stack_offset and the same register list.  Example:
 *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
 *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
 */
.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
                               r2  = 0, r3  = 0, r4  = 0, \
                               r5  = 0, r6  = 0, r7  = 0, \
                               r8  = 0, r9  = 0, r10 = 0, \
                               r11 = 0, r12 = 0, r13 = 0, \
                               r14 = 0
    /* second term is stack_offset modulo 4 */
    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
    .error "Stack offset must be non-negative and a multiple of 4."
    .endif
    lw              \r1, 0(sp)
    .if \r2 != 0
    lw              \r2, 4(sp)
    .endif
    .if \r3 != 0
    lw              \r3, 8(sp)
    .endif
    .if \r4 != 0
    lw              \r4, 12(sp)
    .endif
    /* from the fifth register on, the frame size has to be verified */
    .if \r5 != 0
    CHECK_STACK_OFFSET 5, \stack_offset
    lw              \r5, 16(sp)
    .endif
    .if \r6 != 0
    CHECK_STACK_OFFSET 6, \stack_offset
    lw              \r6, 20(sp)
    .endif
    .if \r7 != 0
    CHECK_STACK_OFFSET 7, \stack_offset
    lw              \r7, 24(sp)
    .endif
    .if \r8 != 0
    CHECK_STACK_OFFSET 8, \stack_offset
    lw              \r8, 28(sp)
    .endif
    .if \r9 != 0
    CHECK_STACK_OFFSET 9, \stack_offset
    lw              \r9, 32(sp)
    .endif
    .if \r10 != 0
    CHECK_STACK_OFFSET 10, \stack_offset
    lw              \r10, 36(sp)
    .endif
    .if \r11 != 0
    CHECK_STACK_OFFSET 11, \stack_offset
    lw              \r11, 40(sp)
    .endif
    .if \r12 != 0
    CHECK_STACK_OFFSET 12, \stack_offset
    lw              \r12, 44(sp)
    .endif
    .if \r13 != 0
    CHECK_STACK_OFFSET 13, \stack_offset
    lw              \r13, 48(sp)
    .endif
    .if \r14 != 0
    CHECK_STACK_OFFSET 14, \stack_offset
    lw              \r14, 52(sp)
    .endif
    /* release the frame only after every register has been reloaded */
    .if \stack_offset != 0
    addiu           sp, sp, \stack_offset
    .endif
.endm
|
||||
|
||||
/*
|
||||
* Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel
|
||||
* returned in (out_8888) register. Requires two temporary registers
|
||||
@ -203,4 +367,136 @@ LEAF_MIPS32R2(symbol) \
|
||||
srl \out2_565, \out1_565, 16
|
||||
.endm
|
||||
|
||||
/*
 * Multiply every channel of one a8r8g8b8 pixel (s_8888) by a single 8-bit
 * value (m_8) and return the resulting pixel in d_8888.
 *
 * maskLSR is needed for the rounding step and must hold:
 *     li maskLSR, 0x00ff00ff
 *
 * Side effects: m_8 is overwritten in place by the first instruction
 * (it ends up holding the value replicated into both halfwords), and
 * scratch1-scratch3 are clobbered.  d_8888 is first written after the
 * last read of s_8888, so d_8888 may name the same register as s_8888.
 */
|
||||
.macro MIPS_UN8x4_MUL_UN8 s_8888, \
|
||||
m_8, \
|
||||
d_8888, \
|
||||
maskLSR, \
|
||||
scratch1, scratch2, scratch3
|
||||
replv.ph \m_8, \m_8 /* 0 | M | 0 | M (clobbers m_8) */
|
||||
muleu_s.ph.qbl \scratch1, \s_8888, \m_8 /* A*M | R*M */
|
||||
muleu_s.ph.qbr \scratch2, \s_8888, \m_8 /* G*M | B*M */
|
||||
/* rounding term: (product >> 8, rounded), kept to 8 bits per halfword */
shra_r.ph \scratch3, \scratch1, 8
|
||||
shra_r.ph \d_8888, \scratch2, 8
|
||||
and \scratch3, \scratch3, \maskLSR /* 0 |A*M| 0 |R*M */
|
||||
and \d_8888, \d_8888, \maskLSR /* 0 |G*M| 0 |B*M */
|
||||
addq.ph \scratch1, \scratch1, \scratch3 /* product + rounding term (A | R) */
|
||||
addq.ph \scratch2, \scratch2, \d_8888 /* product + rounding term (G | B) */
|
||||
shra_r.ph \scratch1, \scratch1, 8
|
||||
shra_r.ph \scratch2, \scratch2, 8
|
||||
precr.qb.ph \d_8888, \scratch1, \scratch2 /* pack back to A | R | G | B */
|
||||
.endm
|
||||
|
||||
/*
 * Two-pixel version of MIPS_UN8x4_MUL_UN8: multiply every channel of
 * s1_8888 by the 8-bit value m1_8 and every channel of s2_8888 by the
 * 8-bit value m2_8, returning the pixels in d1_8888 and d2_8888.
 *
 * maskLSR is needed for the rounding step and must hold:
 *     li maskLSR, 0x00ff00ff
 *
 * Side effects: m1_8 and m2_8 are overwritten in place by the first two
 * instructions (each ends up replicated into both halfwords), and
 * scratch1-scratch6 are clobbered.  d1_8888/d2_8888 are first written
 * after the last read of s1_8888/s2_8888, so the destinations may name the
 * same registers as the sources.
 */
|
||||
.macro MIPS_2xUN8x4_MUL_2xUN8 s1_8888, \
|
||||
s2_8888, \
|
||||
m1_8, \
|
||||
m2_8, \
|
||||
d1_8888, \
|
||||
d2_8888, \
|
||||
maskLSR, \
|
||||
scratch1, scratch2, scratch3, \
|
||||
scratch4, scratch5, scratch6
|
||||
replv.ph \m1_8, \m1_8 /* 0 | M1 | 0 | M1 (clobbers m1_8) */
|
||||
replv.ph \m2_8, \m2_8 /* 0 | M2 | 0 | M2 (clobbers m2_8) */
|
||||
muleu_s.ph.qbl \scratch1, \s1_8888, \m1_8 /* A1*M1 | R1*M1 */
|
||||
muleu_s.ph.qbr \scratch2, \s1_8888, \m1_8 /* G1*M1 | B1*M1 */
|
||||
muleu_s.ph.qbl \scratch3, \s2_8888, \m2_8 /* A2*M2 | R2*M2 */
|
||||
muleu_s.ph.qbr \scratch4, \s2_8888, \m2_8 /* G2*M2 | B2*M2 */
|
||||
/* rounding terms: (product >> 8, rounded), kept to 8 bits per halfword */
shra_r.ph \scratch5, \scratch1, 8
|
||||
shra_r.ph \d1_8888, \scratch2, 8
|
||||
shra_r.ph \scratch6, \scratch3, 8
|
||||
shra_r.ph \d2_8888, \scratch4, 8
|
||||
and \scratch5, \scratch5, \maskLSR /* 0 |A1*M1| 0 |R1*M1 */
|
||||
and \d1_8888, \d1_8888, \maskLSR /* 0 |G1*M1| 0 |B1*M1 */
|
||||
and \scratch6, \scratch6, \maskLSR /* 0 |A2*M2| 0 |R2*M2 */
|
||||
and \d2_8888, \d2_8888, \maskLSR /* 0 |G2*M2| 0 |B2*M2 */
|
||||
/* product + rounding term, then the final shift back to 8 bits */
addq.ph \scratch1, \scratch1, \scratch5
|
||||
addq.ph \scratch2, \scratch2, \d1_8888
|
||||
addq.ph \scratch3, \scratch3, \scratch6
|
||||
addq.ph \scratch4, \scratch4, \d2_8888
|
||||
shra_r.ph \scratch1, \scratch1, 8
|
||||
shra_r.ph \scratch2, \scratch2, 8
|
||||
shra_r.ph \scratch3, \scratch3, 8
|
||||
shra_r.ph \scratch4, \scratch4, 8
|
||||
precr.qb.ph \d1_8888, \scratch1, \scratch2 /* pixel 1: A | R | G | B */
|
||||
precr.qb.ph \d2_8888, \scratch3, \scratch4 /* pixel 2: A | R | G | B */
|
||||
.endm
|
||||
|
||||
/*
 * Multiply one a8r8g8b8 pixel (s_8888) channel by channel with another
 * a8r8g8b8 pixel (m_8888) and return the resulting pixel in d_8888.
 *
 * maskLSR is needed for the rounding step and must hold:
 *     li maskLSR, 0x00ff00ff
 *
 * s_8888 and m_8888 are only read; scratch1-scratch4 are clobbered.
 * d_8888 is written by the last instruction only, so it may name the same
 * register as s_8888 or m_8888.
 *
 * In the comments below the suffix s marks a channel of s_8888 and the
 * suffix m a channel of m_8888.
 */
|
||||
.macro MIPS_UN8x4_MUL_UN8x4 s_8888, \
|
||||
m_8888, \
|
||||
d_8888, \
|
||||
maskLSR, \
|
||||
scratch1, scratch2, scratch3, scratch4
|
||||
preceu.ph.qbl \scratch1, \m_8888 /* 0 | Am | 0 | Rm */
|
||||
preceu.ph.qbr \scratch2, \m_8888 /* 0 | Gm | 0 | Bm */
|
||||
muleu_s.ph.qbl \scratch3, \s_8888, \scratch1 /* As*Am | Rs*Rm */
|
||||
muleu_s.ph.qbr \scratch4, \s_8888, \scratch2 /* Gs*Gm | Bs*Bm */
|
||||
/* rounding term: (product >> 8, rounded), kept to 8 bits per halfword */
shra_r.ph \scratch1, \scratch3, 8
|
||||
shra_r.ph \scratch2, \scratch4, 8
|
||||
and \scratch1, \scratch1, \maskLSR /* 0 |As*Am| 0 |Rs*Rm */
|
||||
and \scratch2, \scratch2, \maskLSR /* 0 |Gs*Gm| 0 |Bs*Bm */
|
||||
/* product + rounding term, then the final shift back to 8 bits */
addq.ph \scratch1, \scratch1, \scratch3
|
||||
addq.ph \scratch2, \scratch2, \scratch4
|
||||
shra_r.ph \scratch1, \scratch1, 8
|
||||
shra_r.ph \scratch2, \scratch2, 8
|
||||
precr.qb.ph \d_8888, \scratch1, \scratch2 /* pack back to A | R | G | B */
|
||||
.endm
|
||||
|
||||
/*
 * Two-pixel version of MIPS_UN8x4_MUL_UN8x4: multiply s1_8888 channel by
 * channel with m1_8888, and s2_8888 with m2_8888, returning the pixels in
 * d1_8888 and d2_8888.
 *
 * maskLSR is needed for the rounding step and must hold:
 *     li maskLSR, 0x00ff00ff
 *
 * The source and mask registers are only read; scratch1-scratch6 are
 * clobbered.  All four inputs are consumed by the first eight
 * instructions, before d1_8888/d2_8888 are first written, so the
 * destinations may name the same registers as the sources or masks.
 * scratch5 and scratch6 are first written after both masks have been
 * unpacked, so they may name the mask registers when the masks are not
 * needed afterwards.
 *
 * NOTE: d1_8888 and d2_8888 double as temporaries: until the final two
 * instructions they hold the A|R and G|B halves of pixel 2, not pixel 1
 * and pixel 2.
 */
|
||||
|
||||
.macro MIPS_2xUN8x4_MUL_2xUN8x4 s1_8888, \
|
||||
s2_8888, \
|
||||
m1_8888, \
|
||||
m2_8888, \
|
||||
d1_8888, \
|
||||
d2_8888, \
|
||||
maskLSR, \
|
||||
scratch1, scratch2, scratch3, \
|
||||
scratch4, scratch5, scratch6
|
||||
preceu.ph.qbl \scratch1, \m1_8888 /* 0 | Am1 | 0 | Rm1 */
|
||||
preceu.ph.qbr \scratch2, \m1_8888 /* 0 | Gm1 | 0 | Bm1 */
|
||||
preceu.ph.qbl \scratch3, \m2_8888 /* 0 | Am2 | 0 | Rm2 */
|
||||
preceu.ph.qbr \scratch4, \m2_8888 /* 0 | Gm2 | 0 | Bm2 */
|
||||
muleu_s.ph.qbl \scratch5, \s1_8888, \scratch1 /* pixel 1: A*Am1 | R*Rm1 */
|
||||
muleu_s.ph.qbr \scratch6, \s1_8888, \scratch2 /* pixel 1: G*Gm1 | B*Bm1 */
|
||||
muleu_s.ph.qbl \scratch1, \s2_8888, \scratch3 /* pixel 2: A*Am2 | R*Rm2 */
|
||||
muleu_s.ph.qbr \scratch2, \s2_8888, \scratch4 /* pixel 2: G*Gm2 | B*Bm2 */
|
||||
/* rounding terms: (product >> 8, rounded), kept to 8 bits per halfword */
shra_r.ph \scratch3, \scratch5, 8
|
||||
shra_r.ph \scratch4, \scratch6, 8
|
||||
shra_r.ph \d1_8888, \scratch1, 8
|
||||
shra_r.ph \d2_8888, \scratch2, 8
|
||||
and \scratch3, \scratch3, \maskLSR /* pixel 1: 0 |A| 0 |R */
|
||||
and \scratch4, \scratch4, \maskLSR /* pixel 1: 0 |G| 0 |B */
|
||||
and \d1_8888, \d1_8888, \maskLSR /* pixel 2: 0 |A| 0 |R */
|
||||
and \d2_8888, \d2_8888, \maskLSR /* pixel 2: 0 |G| 0 |B */
|
||||
/* product + rounding term, then the final shift back to 8 bits */
addq.ph \scratch3, \scratch3, \scratch5
|
||||
addq.ph \scratch4, \scratch4, \scratch6
|
||||
addq.ph \d1_8888, \d1_8888, \scratch1
|
||||
addq.ph \d2_8888, \d2_8888, \scratch2
|
||||
shra_r.ph \scratch3, \scratch3, 8
|
||||
shra_r.ph \scratch4, \scratch4, 8
|
||||
shra_r.ph \scratch5, \d1_8888, 8
|
||||
shra_r.ph \scratch6, \d2_8888, 8
|
||||
precr.qb.ph \d1_8888, \scratch3, \scratch4 /* pixel 1: A | R | G | B */
|
||||
precr.qb.ph \d2_8888, \scratch5, \scratch6 /* pixel 2: A | R | G | B */
|
||||
.endm
|
||||
|
||||
#endif //PIXMAN_MIPS_DSPR2_ASM_H
|
||||
|
@ -49,6 +49,11 @@ PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
|
||||
PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
|
||||
uint8_t, 3, uint8_t, 3)
|
||||
|
||||
/*
 * Solid source + a8r8g8b8 component-alpha mask fast paths.  Each line
 * declares pixman_composite_<name>_asm_mips and defines the C wrapper
 * mips_composite_<name>.  The mask is read as uint32_t; the destination is
 * uint32_t for the 8888 variant and uint16_t for the 0565 variant.
 */
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
|
||||
uint32_t, 1, uint32_t, 1)
|
||||
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
|
||||
uint32_t, 1, uint16_t, 1)
|
||||
|
||||
static pixman_bool_t
|
||||
pixman_fill_mips (uint32_t *bits,
|
||||
int stride,
|
||||
@ -184,6 +189,13 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
|
||||
PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
|
||||
PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, mips_composite_src_0888_0888),
|
||||
|
||||
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mips_composite_over_n_8888_8888_ca),
|
||||
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mips_composite_over_n_8888_8888_ca),
|
||||
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mips_composite_over_n_8888_8888_ca),
|
||||
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
|
||||
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca),
|
||||
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca),
|
||||
|
||||
{ PIXMAN_OP_NONE },
|
||||
};
|
||||
|
||||
|
@ -85,4 +85,46 @@ mips_composite_##name (pixman_implementation_t *imp, \
|
||||
} \
|
||||
}
|
||||
|
||||
/*******************************************************************/
|
||||
|
||||
/*
 * Binds a "solid source, mask image, destination image" assembly routine
 * to a pixman composite function.
 *
 * The expansion declares
 *     void pixman_composite_<name>_asm_mips (dst, src, mask, w)
 * and defines the static wrapper mips_composite_<name> (imp, info), which:
 *   - fetches the solid source colour with _pixman_image_get_solid, in the
 *     destination image format;
 *   - returns at once when `flags` contains SKIP_ZERO_SRC and that colour
 *     is 0;
 *   - obtains the first-line pointers and strides of the destination and
 *     the mask with PIXMAN_IMAGE_GET_LINE;
 *   - calls the assembly routine once per scanline, `height` times, with
 *     `width` as the pixel count.
 *
 * flags               - option bits; only SKIP_ZERO_SRC is examined here
 * name                - fast path name used to build both identifiers
 * mask_type, mask_cnt - element type of the mask and the count forwarded
 *                       to PIXMAN_IMAGE_GET_LINE
 * dst_type, dst_cnt   - same for the destination
 *
 * NOTE(review): src_image, mask_image, dest_image, the x/y offsets, width
 * and height are presumably the locals introduced by PIXMAN_COMPOSITE_ARGS;
 * that macro is defined outside this view.
 */
#define PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST(flags, name,          \
                                              mask_type, mask_cnt,  \
                                              dst_type, dst_cnt)    \
void                                                                \
pixman_composite_##name##_asm_mips (dst_type  *dst,                 \
                                    uint32_t  src,                  \
                                    mask_type *mask,                \
                                    int32_t   w);                   \
                                                                    \
static void                                                         \
mips_composite_##name (pixman_implementation_t *imp,                \
                       pixman_composite_info_t *info)               \
{                                                                   \
    PIXMAN_COMPOSITE_ARGS (info);                                   \
    dst_type  *dst_line, *dst;                                      \
    mask_type *mask_line, *mask;                                    \
    int32_t    dst_stride, mask_stride;                             \
    uint32_t   src;                                                 \
                                                                    \
    src = _pixman_image_get_solid (                                 \
        imp, src_image, dest_image->bits.format);                   \
                                                                    \
    if (((flags) & SKIP_ZERO_SRC) && src == 0)                      \
        return;                                                     \
                                                                    \
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,    \
                           dst_stride, dst_line, dst_cnt);          \
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,   \
                           mask_stride, mask_line, mask_cnt);       \
                                                                    \
    while (height--)                                                \
    {                                                               \
        dst = dst_line;                                             \
        dst_line += dst_stride;                                     \
        mask = mask_line;                                           \
        mask_line += mask_stride;                                   \
        pixman_composite_##name##_asm_mips (dst, src, mask, width); \
    }                                                               \
}
|
||||
|
||||
#endif //PIXMAN_MIPS_DSPR2_H
|
||||
|
Loading…
Reference in New Issue
Block a user