/* links/fbcommon.inc */

/* n is in bytes. dest must begin on a pixel boundary. If n is not a whole
 * number of pixels, it is rounded down.
 * if fb_pixelsize is 1, no alignment is required.
 * if fb_pixelsize is 2, dest must be aligned to 2 bytes.
 * if fb_pixelsize is 3, no alignment is required.
 * if fb_pixelsize is 4, dest must be aligned to 4 bytes.
 * -- The following cases do not occur; they are listed only for forward
 *    compatibility.
 * if fb_pixelsize is 5, no alignment is required.
 * if fb_pixelsize is 6, dest must be aligned to 2 bytes.
 * if fb_pixelsize is 7, no alignment is required.
 * if fb_pixelsize is 8, dest must be aligned to 8 bytes.
 */
static inline void pixel_set(unsigned char *dest, int n, void *pattern)
{
switch (fb_pixelsize) {
case 1: {
memset(dest, *(unsigned char *)pattern, n);
break;
}
case 2: {
#ifdef t2c
t2c v = *(t2c *)memory_barrier(pattern);
/* ICC has an optimization bug here */
icc_volatile int a;
if ((v & 255) == ((v >> 8) & 255)) {
memset(dest, v, n);
} else {
#if defined(t8c) && !(defined(HAVE_GCC_ASSEMBLER) && defined(__i386__))
t8c vvvv = ((t8c)v << 48) | ((t8c)v << 32) | ((t8c)v << 16) | v;
#elif defined(t4c)
t4c vv = ((t4c)v << 16) | v;
#endif
a = n >> 1;
while (a) {
#if defined(t8c) && !(defined(HAVE_GCC_ASSEMBLER) && defined(__i386__))
if (!((unsigned long)dest & 7) && a >= 4) {
#if defined(HAVE_GCC_ASSEMBLER) && defined(__x86_64__)
int tmp;
__asm__ volatile ("rep stosq" : "=D"(dest), "=c"(tmp) : "D"(dest), "c"(a >> 2), "a"(vvvv) : "memory");
a &= 3;
#else
do {
*((t8c *)dest) = vvvv;
dest += 8;
a -= 4;
} while (a >= 4);
#endif
} else
#elif defined(t4c)
if (!((unsigned long)dest & 3) && a >= 2) {
#if defined(HAVE_GCC_ASSEMBLER) && defined(__i386__)
int tmp;
__asm__ volatile ("cld; rep stosl" : "=D"(dest), "=c"(tmp) : "D"(dest), "c"(a >> 1), "a"(vv) : "cc", "memory");
a &= 1;
#else
do {
*((t4c *)dest) = vv;
dest += 4;
a -= 2;
} while (a >= 2);
#endif
} else
#endif
{
*((t2c *)dest) = v;
dest += 2;
a--;
}
}
}
#else
unsigned char a, b;
int i;
a = *(unsigned char*)pattern;
b = ((unsigned char*)pattern)[1];
if (a == b) memset(dest, a, n);
else for (i = 0; i <= n - 2; i += 2) {
dest[i] = a;
dest[i+1] = b;
}
#endif
break;
}
case 3: {
unsigned char a, b, c;
a = *(unsigned char*)pattern;
b = ((unsigned char*)pattern)[1];
c = ((unsigned char*)pattern)[2];
if (a == b && b == c) {
memset(dest, a, n);
} else {
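/*
 * A 3-byte pattern does not tile into a power-of-two word, so the loop below
 * stores 4 (or 8) bytes at a time and shifts the word after every store so
 * that the byte sequence a b c a b c ... continues across word boundaries.
 */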
#if defined(t4c)
#if defined(t8c_is_efficient)
t8c t;
if (!big_endian) {
t = a | (b << 8) | (c << 16);
} else {
t = b | (a << 8) | (c << 16);
}
t |= (t << 24) | (t << 48);
#else
t4c t;
if (!big_endian) {
t = a | (b << 8) | (c << 16) | (a << 24);
} else {
t = a | (c << 8) | (b << 16) | (a << 24);
}
#endif
while (n) {
#if defined(t8c_is_efficient)
if (!((unsigned long)dest & 7) && n >= 8) {
do {
*((t8c *)dest) = t;
dest += 8;
n -= 8;
if (!big_endian) {
t = (t << 8) | (t >> 16);
} else {
t = (t >> 8) | (t << 16);
}
} while (n >= 8);
} else
#else
if (!((unsigned long)dest & 3) && n >= 4) {
do {
*((t4c *)dest) = t;
dest += 4;
n -= 4;
if (!big_endian) {
t = (t >> 8) | (t << 16);
} else {
t = (t << 8) | (t >> 16);
}
} while (n >= 4);
} else
#endif
{
if (!big_endian) {
*dest++ = (unsigned char)t;
t = (t >> 8) | (t << 16);
} else {
*dest++ = (unsigned char)(t
#if defined(t8c_is_efficient)
>> 8
#endif
);
t = (t << 8) | (t >> 16);
}
n--;
}
}
#else
int i;
for (i = 0; i <= n - 3; i += 3) {
dest[i] = a;
dest[i + 1] = b;
dest[i + 2] = c;
}
#endif
}
break;
}
case 4: {
if (((unsigned char *)pattern)[1] == ((unsigned char *)pattern)[2] &&
((unsigned char *)pattern)[1] == ((unsigned char *)pattern)[drv->depth & ~255 ? 3 : 0]) {
memset(dest, ((unsigned char *)pattern)[1], n);
} else {
#if defined(HAVE_GCC_ASSEMBLER) && defined(__i386__)
unsigned v = *(unsigned *)memory_barrier(pattern);
int tmp;
__asm__ volatile ("cld; rep stosl" : "=D"(dest), "=c"(tmp) : "D"(dest), "c"(n >> 2), "a"(v) : "cc", "memory");
#elif defined(t4c)
t4c v = *(t4c *)memory_barrier(pattern);
/* ICC has an optimization bug here */
icc_volatile int a;
{
#ifdef t8c
t8c vv = ((t8c)v << 32) | v;
#endif
a = n >> 2;
while (a) {
#ifdef t8c
if (!((unsigned long)dest & 7) && a >= 2) {
#if defined(HAVE_GCC_ASSEMBLER) && defined(__x86_64__)
int tmp;
__asm__ volatile ("rep stosq" : "=D"(dest), "=c"(tmp) : "D"(dest), "c"(a >> 1), "a"(vv) : "memory");
a &= 1;
#else
do {
*((t8c *)dest) = vv;
dest += 8;
a -= 2;
} while (a >= 2);
#endif
} else
#endif
{
*(t4c *)dest = v;
dest += 4;
a--;
}
}
}
#else
unsigned char a, b, c, d;
int i;
a = *(unsigned char*)pattern;
b = ((unsigned char*)pattern)[1];
c = ((unsigned char*)pattern)[2];
d = ((unsigned char*)pattern)[3];
for (i = 0; i <= n - 4; i += 4) {
dest[i] = a;
dest[i + 1] = b;
dest[i + 2] = c;
dest[i + 3] = d;
}
#endif
}
break;
}
#if 0
default: {
int a;
for (a = 0; a < n / fb_pixelsize; a++, dest += fb_pixelsize) memcpy(dest, pattern, fb_pixelsize);
}
break;
#endif
}
}
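/*
 * Minimal usage sketch (not part of this file, kept under #if 0): fill one
 * scanline of the framebuffer with a single colour. The names fb_mem,
 * fb_linesize, xsize and pix are hypothetical stand-ins for driver state;
 * only pixel_set and fb_pixelsize come from this file. Note that n is a byte
 * count and that dest must satisfy the alignment rules documented above
 * pixel_set.
 */
#if 0
static void example_fill_line(unsigned char *fb_mem, int fb_linesize, int y, int xsize, unsigned char *pix)
{
	/* hypothetical: fb_mem is the mapped framebuffer, fb_linesize the stride in bytes */
	unsigned char *dest = fb_mem + (size_t)y * fb_linesize;
	pixel_set(dest, xsize * fb_pixelsize, pix);	/* pix holds one pixel in framebuffer format */
}
#endif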
static inline void memcpy_to_fb_inline(unsigned char *dest, unsigned char *src, size_t len, int from_fb)
{
#ifdef HAVE_GCC_ASSEMBLER
#if defined(__i386__)
#define memcpy_to_fb_implemented
/* memcpy in glibc 2.17 has half the theoretical throughput */
size_t tmp;
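/*
 * Copy up to three leading bytes so that the destination becomes 4-byte
 * aligned, move the bulk with "rep movsl", then copy the remaining one to
 * three tail bytes.
 */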
__asm__ volatile (" \n\
cld \n\
testw $3, %%di \n\
jz 1f \n\
testw $1, %%di \n\
jz 2f \n"
#ifdef __TINYC__
" testl %%ecx, %%ecx \n\
je 9f \n"
#else
" jecxz 9f \n"
#endif
" movsb \n\
decl %%ecx \n\
testw $2, %%di \n\
jz 1f \n\
2: movl %%ecx, %0 \n\
subl $2, %%ecx \n\
jb 4f \n\
movsw \n\
1: movl %%ecx, %0 \n\
shrl $2, %%ecx \n\
rep \n\
movsl \n\
testb $3, %b0 \n\
jz 9f \n\
testb $2, %b0 \n\
jz 3f \n\
movsw \n\
4: testb $1, %b0 \n\
jz 9f \n\
3: movsb \n\
9: " : "=q"(tmp), "=D"(dest), "=S"(src), "=c"(len) : "D"(dest), "S"(src), "c"(len) : "cc", "memory");
return;
#endif
#if defined(__x86_64__) && !defined(__TINYC__)
#define memcpy_to_fb_implemented
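/*
 * When the source lies in the framebuffer (from_fb), ordinary loads are very
 * slow because the mapping is typically write-combining and uncached. If
 * SSE4.1 is available, MOVNTDQA performs streaming 16-byte loads, which is
 * much faster from such memory; the code aligns the source to 16 bytes, runs
 * the streaming loop, and handles the unaligned head and tail through a small
 * bounce buffer.
 */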
if (from_fb) {
size_t l;
unsigned char buffer[16];
static int have_sse41 = -1;
if (have_sse41 < 0) {
unsigned tmp_eax, ecx;
__asm__ ("pushq %%rbx; pushq %%rdx; cpuid; popq %%rdx; popq %%rbx" : "=a"(tmp_eax), "=c"(ecx) : "a"(1));
have_sse41 = (ecx >> 19) & 1;
}
if (!have_sse41)
goto use_memcpy;
l = -(size_t)src & 0xf;
if (l) {
__asm__ volatile ("movntdqa %1, %%xmm0; movdqu %%xmm0, %0" : "=m"(buffer) : "m"(*(src - 0x10 + l)) : "xmm0", "memory");
memcpy(dest, buffer + 0x10 - l, l < len ? l : len);
if (l >= len)
return;
dest += l;
src += l;
len -= l;
}
__asm__ volatile (" \n\
jmp 2f \n\
.p2align 4 \n\
1: movntdqa (%1), %%xmm0 \n\
add $16, %1 \n\
movdqu %%xmm0, (%0) \n\
add $16, %0 \n\
2: sub $16, %2 \n\
jae 1b \n\
" : "=r"(dest), "=r"(src), "=r"(len) : "0"(dest), "1"(src), "2"(len) : "xmm0", "memory", "cc");
l = len & 0xf;
if (l) {
__asm__ volatile ("movntdqa %1, %%xmm0; movdqu %%xmm0, %0" : "=m"(buffer) : "m"(*src) : "xmm0", "memory");
memcpy(dest, buffer, l);
}
return;
}
use_memcpy:
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_UNALIGNED)
#define memcpy_to_fb_implemented
/*
* This is not faster than the glibc memcpy, but there's some problem
* with data corruption in the PCIe controller with unaligned writes
*/
unsigned tmp32;
unsigned long tmp1, tmp2, tmp3, tmp4;
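/*
 * Align the destination in 1/2/4/8-byte steps, copy the bulk 32 bytes per
 * iteration with ldp/stp pairs, then finish the tail in 8/4/2/1-byte steps,
 * so every store to the framebuffer is naturally aligned.
 */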
__asm__ volatile (" \n\
tbz %0, #0, 1f \n\
subs %2, %2, #1 \n\
b.cc 10f \n\
ldrb %w3, [ %1 ], #1 \n\
strb %w3, [ %0 ], #1 \n\
1: \n\
tbz %0, #1, 2f \n\
subs %2, %2, #2 \n\
b.cc 9f \n\
ldrh %w3, [ %1 ], #2 \n\
strh %w3, [ %0 ], #2 \n\
2: \n\
tbz %0, #2, 3f \n\
subs %2, %2, #4 \n\
b.cc 8f \n\
ldr %w3, [ %1 ], #4 \n\
str %w3, [ %0 ], #4 \n\
3: \n\
tbz %0, #3, 4f \n\
subs %2, %2, #8 \n\
b.cc 7f \n\
ldr %4, [ %1 ], #8 \n\
str %4, [ %0 ], #8 \n\
4: \n\
subs %2, %2, #16 \n\
b.cc 6f \n\
\n\
tbnz %2, #4, 55f \n\
ldp %4, %5, [ %1 ], #16 \n\
stp %4, %5, [ %0 ], #16 \n\
subs %2, %2, #16 \n\
b.cc 6f \n\
55: \n\
add %0, %0, #16 \n\
add %1, %1, #16 \n\
.p2align 3 \n\
5: \n\
ldp %4, %5, [ %1, #-16 ] \n\
ldp %6, %7, [ %1 ], #32 \n\
subs %2, %2, #32 \n\
stp %4, %5, [ %0, #-16 ] \n\
stp %6, %7, [ %0 ], #32 \n\
b.cs 5b \n\
sub %0, %0, #16 \n\
sub %1, %1, #16 \n\
6: \n\
tbz %2, #3, 7f \n\
ldr %4, [ %1 ], #8 \n\
str %4, [ %0 ], #8 \n\
7: \n\
tbz %2, #2, 8f \n\
ldr %w3, [ %1 ], #4 \n\
str %w3, [ %0 ], #4 \n\
8: \n\
tbz %2, #1, 9f \n\
ldrh %w3, [ %1 ], #2 \n\
strh %w3, [ %0 ], #2 \n\
9: \n\
tbz %2, #0, 10f \n\
ldrb %w3, [ %1 ] \n\
strb %w3, [ %0 ] \n\
10: \n\
" : "=r"(dest), "=r"(src), "=r"(len), "=r"(tmp32), "=r"(tmp1), "=r"(tmp2), "=r"(tmp3), "=r"(tmp4) : "0"(dest), "1"(src), "2"(len) : "cc", "memory");
return;
#endif
#if defined(__alpha__)
#define memcpy_to_fb_implemented
/*
* The glibc memcpy is very slow because it reads the same value
* from the framebuffer multiple times.
* There's no point in unrolling the loops because performance is
* limited by the bus.
*/
unsigned long tmp1, tmp2, tmp3, tmp4;
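/*
 * "amask 1" tests for the BWX (byte/word) extension at run time. The byte and
 * word stores on the BWX path are emitted as raw opcodes (.long 0x3b170000 is
 * stb $24, 0($23); .long 0x37170000 is stw $24, 0($23)), presumably so the
 * file still assembles when the toolchain targets a pre-BWX CPU.
 */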
__asm__ volatile (" \n\
amask 1, $22 \n\
subq $31, %0, %3 \n\
beq %2, 6f \n\
addq %0, %2, %4 \n\
and %3, 7, %3 \n\
addq %1, %3, %6 \n\
cmpult %2, %3, %5 \n\
beq $22, 20f \n\
and %4, 3, $23 \n\
or %5, $23, $23 \n\
beq $23, 7f \n\
ldq_u $25, -1(%4) \n\
7: beq %3, 1f \n\
blbs %5, 12f \n\
and %3, 3, $24 \n\
beq $24, 8f \n\
ldq_u %4, 0(%0) \n\
8: ldq_u %5, 0(%1) \n\
subq %2, %3, %2 \n\
ldq_u %6, -1(%6) \n\
extql %5, %1, %5 \n\
extqh %6, %1, %6 \n\
addq %1, %3, %1 \n\
or %5, %6, %5 \n\
beq $24, 9f \n\
insql %5, %0, %5 \n\
mskql %4, %0, %4 \n\
or %5, %4, %4 \n\
stq_u %4, 0(%0) \n\
br 10f \n\
9: stl %5, 0(%0) \n\
10: addq %0, %3, %0 \n\
\n\
1: subq %2, 8, %2 \n\
blt %2, 4f \n\
and %1, 7, %4 \n\
bne %4, 2f \n\
.p2align 3 \n\
3: ldq %3, 0(%1) \n\
addq %1, 8, %1 \n\
subq %2, 8, %2 \n\
stq %3, 0(%0) \n\
addq %0, 8, %0 \n\
bge %2, 3b \n\
br 4f \n\
\n\
2: ldq_u %5, 0(%1) \n\
.p2align 3 \n\
5: ldq_u %4, 8(%1) \n\
extql %5, %1, %5 \n\
extqh %4, %1, %6 \n\
subq %2, 8, %2 \n\
or %5, %6, %5 \n\
addq %1, 8, %1 \n\
stq %5, 0(%0) \n\
mov %4, %5 \n\
addq %0, 8, %0 \n\
bge %2, 5b \n\
\n\
4: and %2, 7, %2 \n\
12: beq %2, 6f \n\
addq %1, %2, %6 \n\
ldq_u $24, 0(%1) \n\
ldq_u %6, -1(%6) \n\
extql $24, %1, $24 \n\
extqh %6, %1, %6 \n\
or $24, %6, $24 \n\
beq $22, 24f \n\
beq $23, 11f \n\
mskql $24, %2, $24 \n\
addq %0, %2, %3 \n\
insql $24, %0, $24 \n\
mskql $25, %0, %6 \n\
mskqh $25, %3, $25 \n\
or $25, %6, $25 \n\
or $25, $24, $24 \n\
stq_u $24, 0(%0) \n\
br 6f \n\
\n\
11: stl $24, 0(%0) \n\
br 6f \n\
\n\
20: beq %3, 1b \n\
addq %1, %2, %4 \n\
cmovlbs %5, %4, %6 \n\
ldq_u $24, 0(%1) \n\
mov %0, $23 \n\
ldq_u $25, -1(%6) \n\
extql $24, %1, $24 \n\
extqh $25, %1, $25 \n\
addq %0, %3, %0 \n\
or $24, $25, $24 \n\
addq %1, %3, %1 \n\
blbc $23, 21f \n\
/*stb $24, 0($23)*/ \n\
.long 0x3b170000 \n\
addq $23, 1, $23 \n\
subq %2, 1, %2 \n\
srl $24, 8, $24 \n\
21: and $23, 2, %4 \n\
beq %4, 22f \n\
subq %2, 2, %2 \n\
blt %2, 26f \n\
/*stw $24, 0($23)*/ \n\
.long 0x37170000 \n\
addq $23, 2, $23 \n\
srl $24, 16, $24 \n\
22: and $23, 4, %4 \n\
beq %4, 1b \n\
subq %2, 4, %2 \n\
blt %2, 25f \n\
stl $24, 0($23) \n\
23: br 1b \n\
\n\
24: and %2, 4, %4 \n\
mov %0, $23 \n\
beq %4, 25f \n\
stl $24, 0(%0) \n\
addq $23, 4, $23 \n\
srl $24, 32, $24 \n\
25: and %2, 2, %4 \n\
beq %4, 26f \n\
/*stw $24, 0($23)*/ \n\
.long 0x37170000 \n\
addq $23, 2, $23 \n\
srl $24, 16, $24 \n\
26: blbc %2, 6f \n\
/*stb $24, 0($23)*/ \n\
.long 0x3b170000 \n\
\n\
6: \n\
" : "=r"(dest), "=r"(src), "=r"(len), "=r"(tmp1), "=r"(tmp2), "=r"(tmp3), "=r"(tmp4) : "0"(dest), "1"(src), "2"(len) : "22", "23", "24", "25", "cc", "memory");
return;
#endif
#endif
memcpy(dest, src, len);
}
static
#ifdef memcpy_to_fb_implemented
ATTR_NOINLINE
#else
inline
#endif
void memcpy_to_fb(unsigned char *dest, unsigned char *src, size_t len, int from_fb)
{
memcpy_to_fb_inline(dest, src, len, from_fb);
}
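/*
 * Usage sketch (hypothetical caller, kept under #if 0): copy one rendered
 * bitmap row into the mapped framebuffer. fb_row, bitmap_row and xsize are
 * illustrative names; from_fb is 0 because the source is ordinary RAM, and is
 * only set when the source itself lies in framebuffer memory.
 */
#if 0
static void example_blit_row(unsigned char *fb_row, unsigned char *bitmap_row, int xsize)
{
	memcpy_to_fb(fb_row, bitmap_row, (size_t)xsize * fb_pixelsize, 0);
}
#endif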
static void memmove_in_fb(unsigned char *dest, unsigned char *src, size_t len)
{
#ifdef memcpy_to_fb_implemented
static unsigned char *buffer = NULL;
static size_t buffer_len = 0;
if (!len)
return;
if (len > buffer_len) {
unsigned char *new_buffer = malloc(len);
if (!new_buffer)
goto fallback_to_memmove;
if (buffer)
free(buffer);
buffer = new_buffer;
buffer_len = len;
}
memcpy_to_fb(buffer, src, len, 1);
memcpy_to_fb(dest, buffer, len, 0);
return;
fallback_to_memmove:
#endif
memmove(dest, src, len);
}
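/*
 * Usage sketch (hypothetical, kept under #if 0): scroll the visible area up
 * by one line. fb_mem, fb_linesize and ysize are illustrative names. Source
 * and destination overlap inside the framebuffer, which is exactly the case
 * memmove_in_fb handles by bouncing the data through a buffer in ordinary RAM.
 */
#if 0
static void example_scroll_up(unsigned char *fb_mem, int fb_linesize, int ysize)
{
	memmove_in_fb(fb_mem, fb_mem + fb_linesize, (size_t)(ysize - 1) * fb_linesize);
}
#endif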