
The performance improvement is only in the ballpark of 5% when compared against
C code built with a reasonably good compiler (gcc 4.5.1), but gcc 4.4 produces
approximately 30% slower code here, so the assembly optimization makes sense to
avoid depending on compiler quality and/or optimization options.

Benchmark from ARM11:
== before ==
op=1, src_fmt=10020565, dst_fmt=10020565, speed=34.86 MPix/s
== after ==
op=1, src_fmt=10020565, dst_fmt=10020565, speed=36.62 MPix/s

Benchmark from ARM Cortex-A8:
== before ==
op=1, src_fmt=10020565, dst_fmt=10020565, speed=89.55 MPix/s
== after ==
op=1, src_fmt=10020565, dst_fmt=10020565, speed=94.91 MPix/s
/*
 * Copyright © 2008 Mozilla Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "pixman-private.h"
#include "pixman-arm-common.h"
#include "pixman-fast-path.h"

#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */

void
pixman_composite_add_8_8_asm_armv6 (int32_t  width,
                                    int32_t  height,
                                    uint8_t *dst_line,
                                    int32_t  dst_stride,
                                    uint8_t *src_line,
                                    int32_t  src_stride)
{
    uint8_t *dst, *src;
    int32_t w;
    uint8_t s, d;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;
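
        /* 'uqadd8' adds the four byte lanes of its operands with
         * unsigned saturation, which is exactly the ADD compositing
         * operator for a8 data; one lane is equivalent to
         * d = (s + d > 255) ? 255 : s + d.
         */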

        /* Ensure both src and dst are properly aligned before doing 32 bit
         * reads; we'll stay in this loop if src and dst have differing
         * alignments.
         */
        while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3)))
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "=r" (d) : "r" (d), "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }

        while (w >= 4)
        {
            asm ("uqadd8 %0, %1, %2"
                 : "=r" (*(uint32_t*)dst)
                 : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst));
            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            asm ("uqadd8 %0, %1, %2" : "=r" (d) : "r" (d), "r" (s));
            *dst = d;

            dst++;
            src++;
            w--;
        }
    }
}
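
/* For reference, a scalar sketch of the per-pixel OVER math that the asm
 * below performs on all four channels at once (this helper is an
 * illustration added for clarity, not part of the original code):
 * dest = src + dest * (255 - src_alpha) / 255 per channel, with the
 * division by 255 done exactly as (t + (t >> 8)) >> 8 where
 * t = x * a + 0x80.
 */
static inline uint32_t
over_pixel_sketch (uint32_t src, uint32_t dest)
{
    uint32_t ia = 255 - (src >> 24);                         /* inverse source alpha */
    uint32_t lo = (dest & 0xff00ff) * ia + 0x800080;         /* red and blue lanes   */
    uint32_t hi = ((dest >> 8) & 0xff00ff) * ia + 0x800080;  /* alpha and green      */

    lo = ((lo + ((lo >> 8) & 0xff00ff)) >> 8) & 0xff00ff;
    hi = (hi + ((hi >> 8) & 0xff00ff)) & 0xff00ff00;

    /* For valid premultiplied pixels this sum cannot overflow a byte;
     * the asm uses 'uqadd8' so it saturates even on invalid input.
     */
    return src + (hi | lo);
}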

void
pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
                                           int32_t   height,
                                           uint32_t *dst_line,
                                           int32_t   dst_stride,
                                           uint32_t *src_line,
                                           int32_t   src_stride)
{
    uint32_t *dst;
    uint32_t *src;
    int32_t w;
    uint32_t component_half = 0x800080;
    uint32_t upper_component_mask = 0xff00ff00;
    uint32_t alpha_mask = 0xff;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also allows us to avoid doing an unnecessary data
             * write, which is more valuable, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            "ldr r4, [%[dest]] \n\t"

#else
            "ldr r4, [%[dest]] \n\t"

            /* = 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"
#endif
            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"
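
            /* r6/r7 now hold the destination pixel split into
             * 0x00rr00bb (even bytes) and 0x00aa00gg (odd bytes)
             * halfword pairs, so each multiply below scales two
             * colour components at once without inter-channel carries.
             */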

            /* multiply by 257 and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"
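
            /* After the 'mla'/'uxtab16' pairs each halfword of r6/r7
             * holds its result in the upper byte.  Masking r7 to those
             * bytes and adding in r6's upper bytes via 'uxtab16 ..., ror #8'
             * packs all four channels back into one 0xaarrggbb word.
             */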

            /* recombine the 0xff00ff00 bytes of r6 and r7 */
            "and r7, r7, %[upper_component_mask]\n\t"
            "uxtab16 r6, r7, r6, ror #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"

#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement counter and jump back to top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half),
              [upper_component_mask] "r" (upper_component_mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
            );
    }
}

void
pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
                                             int32_t   height,
                                             uint32_t *dst_line,
                                             int32_t   dst_stride,
                                             uint32_t *src_line,
                                             int32_t   src_stride,
                                             uint32_t  mask)
{
    uint32_t *dst;
    uint32_t *src;
    int32_t w;
    uint32_t component_half = 0x800080;
    uint32_t alpha_mask = 0xff;
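
    /* Only the alpha byte of the solid mask is used: the source pixel
     * is scaled by it first (the IN step), and the result is then
     * composited OVER the destination.
     */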
    mask >>= 24;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        /* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load src */
            "ldr r5, [%[src]], #4\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also allows us to avoid doing an unnecessary data
             * write, which is more valuable, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

#endif
            "ldr r4, [%[dest]] \n\t"

            "uxtb16 r6, r5\n\t"
            "uxtb16 r7, r5, ror #8\n\t"

            /* multiply by mask alpha, then by 257 and divide by 65536 */
            "mla r6, r6, %[mask_alpha], %[component_half]\n\t"
            "mla r7, r7, %[mask_alpha], %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* 255 - alpha */
            "sub r8, %[alpha_mask], r5, lsr #24\n\t"

            /* multiply by inverse alpha (r8), then by 257 and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"

#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement counter and jump back to top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
            : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
              [alpha_mask] "r" (alpha_mask)
            : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
            );
    }
}

void
pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
                                          int32_t   height,
                                          uint32_t *dst_line,
                                          int32_t   dst_stride,
                                          uint32_t  src,
                                          int32_t   unused,
                                          uint8_t  *mask_line,
                                          int32_t   mask_stride)
{
    uint32_t  srca;
    uint32_t *dst;
    uint8_t  *mask;
    int32_t   w;

    srca = src >> 24;

    uint32_t component_mask = 0xff00ff;
    uint32_t component_half = 0x800080;

    uint32_t src_hi = (src >> 8) & component_mask;
    uint32_t src_lo = src & component_mask;
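
    /* The solid source is pre-split into its even bytes (red and blue,
     * src_lo) and odd bytes (alpha and green, src_hi), each held as
     * 0x00xx00xx halfword pairs, so the inner loop can scale all four
     * channels with just two 'mla' instructions per pixel.
     */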

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        /* #define inner_branch */
        asm volatile (
            "cmp %[w], #0\n\t"
            "beq 2f\n\t"
            "1:\n\t"
            /* load mask */
            "ldrb r5, [%[mask]], #1\n\t"
#ifdef inner_branch
            /* We can avoid doing the multiplication in two cases: 0x0 or 0xff.
             * The 0x0 case also allows us to avoid doing an unnecessary data
             * write, which is more valuable, so we only check for that.
             */
            "cmp r5, #0\n\t"
            "beq 3f\n\t"

#endif
            "ldr r4, [%[dest]] \n\t"

            /* multiply by the mask (r5), then by 257 and divide by 65536 */
            "mla r6, %[src_lo], r5, %[component_half]\n\t"
            "mla r7, %[src_hi], r5, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r5, r6, r7, lsl #8\n\t"

            "uxtb16 r6, r4\n\t"
            "uxtb16 r7, r4, ror #8\n\t"

            /* we could simplify this to use 'sub' if we were
             * willing to give up a register for alpha_mask
             */
            "mvn r8, r5\n\t"
            "mov r8, r8, lsr #24\n\t"

            /* multiply by inverse alpha (r8), then by 257 and divide by 65536 */
            "mla r6, r6, r8, %[component_half]\n\t"
            "mla r7, r7, r8, %[component_half]\n\t"

            "uxtab16 r6, r6, r6, ror #8\n\t"
            "uxtab16 r7, r7, r7, ror #8\n\t"

            "uxtb16 r6, r6, ror #8\n\t"
            "uxtb16 r7, r7, ror #8\n\t"

            /* recombine */
            "orr r6, r6, r7, lsl #8\n\t"

            "uqadd8 r5, r6, r5\n\t"

#ifdef inner_branch
            "3:\n\t"

#endif
            "str r5, [%[dest]], #4\n\t"
            /* decrement counter and jump back to top */
            "subs %[w], %[w], #1\n\t"
            "bne 1b\n\t"
            "2:\n\t"
            : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask)
            : [component_half] "r" (component_half),
              [src_hi] "r" (src_hi), [src_lo] "r" (src_lo)
            : "r4", "r5", "r6", "r7", "r8", "cc", "memory");
    }
}

#endif
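
/* The PIXMAN_ARM_BIND_* macros (from pixman-arm-common.h) expand into
 * the armv6_composite_* and armv6_0565_0565 wrappers named in the fast
 * path table below; each wrapper extracts the image pointers, strides
 * and constants from the composite request and forwards them to the
 * matching *_asm_armv6 routine.
 */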
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
                                   uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
                                   uint32_t, 1, uint32_t, 1)

PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
                                     uint32_t, 1, uint32_t, 1)

PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
                                      uint8_t, 1, uint32_t, 1)

PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
                                        uint16_t, uint16_t)
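
/* Each fast path entry matches an (operator, source format, mask format,
 * destination format) combination; 'null' means no mask and 'solid'
 * means a constant mask.  pixman consults this table before falling
 * back to slower generic paths, and the list is terminated by
 * PIXMAN_OP_NONE.
 */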
static const pixman_fast_path_t arm_simd_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),

    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),

    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),

    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),

    { PIXMAN_OP_NONE },
};
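
/* The ARM SIMD implementation is layered on top of the generic fast
 * path implementation: any operation arm_simd_fast_paths does not
 * cover is delegated down that chain.
 */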
pixman_implementation_t *
_pixman_implementation_create_arm_simd (void)
{
    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
    pixman_implementation_t *imp =
        _pixman_implementation_create (general, arm_simd_fast_paths);

    return imp;
}