580 lines
13 KiB
C++
580 lines
13 KiB
C++
/* n is in bytes. dest must begin on pixel boundary. If n is not a whole number
|
|
* of pixels, rounding is performed downwards.
|
|
* if bmpixelsize is 1, no alignment is required.
|
|
* if bmpixelsize is 2, dest must be aligned to 2 bytes.
|
|
* if bmpixelsize is 3, no alignment is required.
|
|
* if bmpixelsize is 4, dest must be aligned to 4 bytes.
|
|
* -- The following do not occur, this is only for forward compatibility.
|
|
* if bmpixelsize is 5, no alignment is required.
|
|
* if bmpixelsize is 6, dest must be aligned to 2 bytes.
|
|
* if bmpixelsize is 7, no alignment is required.
|
|
* if bmpixelsize is 8, dest must be aligned to 8 bytes.
|
|
*/
|
|
|
|
static inline void pixel_set(unsigned char *dest, int n, void *pattern)
|
|
{
|
|
switch (fb_pixelsize) {
|
|
case 1: {
|
|
memset(dest, *(unsigned char *)pattern, n);
|
|
break;
|
|
}
|
|
|
|
case 2: {
|
|
#ifdef t2c
|
|
t2c v = *(t2c *)memory_barrier(pattern);
|
|
/* ICC has an optimization bug here */
|
|
icc_volatile int a;
|
|
|
|
if ((v & 255) == ((v >> 8) & 255)) {
|
|
memset(dest, v, n);
|
|
} else {
|
|
#if defined(t8c) && !(defined(HAVE_GCC_ASSEMBLER) && defined(__i386__))
|
|
t8c vvvv = ((t8c)v << 48) | ((t8c)v << 32) | ((t8c)v << 16) | v;
|
|
#elif defined(t4c)
|
|
t4c vv = ((t4c)v << 16) | v;
|
|
#endif
|
|
a = n >> 1;
|
|
while (a) {
|
|
#if defined(t8c) && !(defined(HAVE_GCC_ASSEMBLER) && defined(__i386__))
|
|
if (!((unsigned long)dest & 7) && a >= 4) {
|
|
#if defined(HAVE_GCC_ASSEMBLER) && defined(__x86_64__)
|
|
int tmp;
|
|
__asm__ volatile ("rep stosq" : "=D"(dest), "=c"(tmp) : "D"(dest), "c"(a >> 2), "a"(vvvv) : "memory");
|
|
a &= 3;
|
|
#else
|
|
do {
|
|
*((t8c *)dest) = vvvv;
|
|
dest += 8;
|
|
a -= 4;
|
|
} while (a >= 4);
|
|
#endif
|
|
} else
|
|
#elif defined(t4c)
|
|
if (!((unsigned long)dest & 3) && a >= 2) {
|
|
#if defined(HAVE_GCC_ASSEMBLER) && defined(__i386__)
|
|
int tmp;
|
|
__asm__ volatile ("cld; rep stosl" : "=D"(dest), "=c"(tmp) : "D"(dest), "c"(a >> 1), "a"(vv) : "cc", "memory");
|
|
a &= 1;
|
|
#else
|
|
do {
|
|
*((t4c *)dest) = vv;
|
|
dest += 4;
|
|
a -= 2;
|
|
} while (a >= 2);
|
|
#endif
|
|
} else
|
|
#endif
|
|
{
|
|
*((t2c *)dest) = v;
|
|
dest += 2;
|
|
a--;
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
unsigned char a, b;
|
|
int i;
|
|
|
|
a = *(unsigned char*)pattern;
|
|
b = ((unsigned char*)pattern)[1];
|
|
if (a == b) memset(dest, a, n);
|
|
else for (i = 0; i <= n - 2; i += 2) {
|
|
dest[i] = a;
|
|
dest[i+1] = b;
|
|
}
|
|
#endif
|
|
break;
|
|
}
|
|
|
|
case 3: {
|
|
unsigned char a, b, c;
|
|
|
|
a = *(unsigned char*)pattern;
|
|
b = ((unsigned char*)pattern)[1];
|
|
c = ((unsigned char*)pattern)[2];
|
|
if (a == b && b == c) {
|
|
memset(dest, a, n);
|
|
} else {
|
|
#if defined(t4c)
|
|
#if defined(t8c_is_efficient)
|
|
t8c t;
|
|
if (!big_endian) {
|
|
t = a | (b << 8) | (c << 16);
|
|
} else {
|
|
t = b | (a << 8) | (c << 16);
|
|
}
|
|
t |= (t << 24) | (t << 48);
|
|
#else
|
|
t4c t;
|
|
if (!big_endian) {
|
|
t = a | (b << 8) | (c << 16) | (a << 24);
|
|
} else {
|
|
t = a | (c << 8) | (b << 16) | (a << 24);
|
|
}
|
|
#endif
|
|
while (n) {
|
|
#if defined(t8c_is_efficient)
|
|
if (!((unsigned long)dest & 7) && n >= 8) {
|
|
do {
|
|
*((t8c *)dest) = t;
|
|
dest += 8;
|
|
n -= 8;
|
|
if (!big_endian) {
|
|
t = (t << 8) | (t >> 16);
|
|
} else {
|
|
t = (t >> 8) | (t << 16);
|
|
}
|
|
} while (n >= 8);
|
|
} else
|
|
#else
|
|
if (!((unsigned long)dest & 3) && n >= 4) {
|
|
do {
|
|
*((t4c *)dest) = t;
|
|
dest += 4;
|
|
n -= 4;
|
|
if (!big_endian) {
|
|
t = (t >> 8) | (t << 16);
|
|
} else {
|
|
t = (t << 8) | (t >> 16);
|
|
}
|
|
} while (n >= 4);
|
|
} else
|
|
#endif
|
|
{
|
|
if (!big_endian) {
|
|
*dest++ = (unsigned char)t;
|
|
t = (t >> 8) | (t << 16);
|
|
} else {
|
|
*dest++ = (unsigned char)(t
|
|
#if defined(t8c_is_efficient)
|
|
>> 8
|
|
#endif
|
|
);
|
|
t = (t << 8) | (t >> 16);
|
|
}
|
|
n--;
|
|
}
|
|
}
|
|
#else
|
|
int i;
|
|
for (i = 0; i <= n - 3; i += 3) {
|
|
dest[i] = a;
|
|
dest[i + 1] = b;
|
|
dest[i + 2] = c;
|
|
}
|
|
#endif
|
|
}
|
|
break;
|
|
}
|
|
|
|
case 4: {
|
|
if (((unsigned char *)pattern)[1] == ((unsigned char *)pattern)[2] &&
|
|
((unsigned char *)pattern)[1] == ((unsigned char *)pattern)[drv->depth & ~255 ? 3 : 0]) {
|
|
memset(dest, ((unsigned char *)pattern)[1], n);
|
|
} else {
|
|
#if defined(HAVE_GCC_ASSEMBLER) && defined(__i386__)
|
|
unsigned v = *(unsigned *)memory_barrier(pattern);
|
|
int tmp;
|
|
__asm__ volatile ("cld; rep stosl" : "=D"(dest), "=c"(tmp) : "D"(dest), "c"(n >> 2), "a"(v) : "cc", "memory");
|
|
#elif defined(t4c)
|
|
t4c v=*(t4c *)memory_barrier(pattern);
|
|
/* ICC has an optimization bug here */
|
|
icc_volatile int a;
|
|
|
|
{
|
|
#ifdef t8c
|
|
t8c vv = ((t8c)v << 32) | v;
|
|
#endif
|
|
a = n >> 2;
|
|
while (a) {
|
|
#ifdef t8c
|
|
if (!((unsigned long)dest & 7) && a >= 2) {
|
|
#if defined(HAVE_GCC_ASSEMBLER) && defined(__x86_64__)
|
|
int tmp;
|
|
__asm__ volatile ("rep stosq" : "=D"(dest), "=c"(tmp) : "D"(dest), "c"(a >> 1), "a"(vv) : "memory");
|
|
a &= 1;
|
|
#else
|
|
do {
|
|
*((t8c *)dest) = vv;
|
|
dest += 8;
|
|
a -= 2;
|
|
} while (a >= 2);
|
|
#endif
|
|
} else
|
|
#endif
|
|
{
|
|
*(t4c *)dest = v;
|
|
dest += 4;
|
|
a--;
|
|
}
|
|
}
|
|
}
|
|
#else
|
|
unsigned char a, b, c, d;
|
|
int i;
|
|
|
|
a = *(unsigned char*)pattern;
|
|
b = ((unsigned char*)pattern)[1];
|
|
c = ((unsigned char*)pattern)[2];
|
|
d = ((unsigned char*)pattern)[3];
|
|
for (i = 0; i <= n - 4; i += 4) {
|
|
dest[i] = a;
|
|
dest[i + 1] = b;
|
|
dest[i + 2] = c;
|
|
dest[i + 3] = d;
|
|
}
|
|
#endif
|
|
}
|
|
break;
|
|
}
|
|
|
|
#if 0
|
|
default: {
|
|
int a;
|
|
for (a = 0; a < n / fb_pixelsize; a++, dest += fb_pixelsize) memcpy(dest, pattern, fb_pixelsize);
|
|
}
|
|
break;
|
|
#endif
|
|
}
|
|
}
|
|
|
|
/*
 * Copy len bytes from src to dest, where one side is framebuffer memory.
 *
 * from_fb is only consulted by the x86-64 variant: when nonzero, the copy
 * is done with movntdqa loads from src if CPUID reports the feature in
 * ECX bit 19 of leaf 1 (SSE4.1).
 *
 * With HAVE_GCC_ASSEMBLER, i386, x86-64, aarch64 and alpha each get a
 * hand-written variant and memcpy_to_fb_implemented is defined. Everything
 * else (and the x86-64 non-from_fb / no-SSE4.1 case) uses plain memcpy.
 */
static inline void memcpy_to_fb_inline(unsigned char *dest, unsigned char *src, size_t len, int from_fb)
{
#ifdef HAVE_GCC_ASSEMBLER
#if defined(__i386__)
#define memcpy_to_fb_implemented
	/* memcpy in glibc 2.17 has half the theoretical throughput */
	/*
	 * Copy 1 and/or 2 bytes until %edi is 4-byte aligned, then rep movsl,
	 * then the 0-3 remaining bytes. %0 keeps the byte count whose low
	 * bits select the tail moves.
	 */
	size_t tmp;
	__asm__ volatile ("					\n\
		cld \n\
		testw $3, %%di \n\
		jz 1f \n\
		testw $1, %%di \n\
		jz 2f \n"
#ifdef __TINYC__
"		testl %%ecx, %%ecx \n\
		je 9f \n"
#else
"		jecxz 9f \n"
#endif
"		movsb \n\
		decl %%ecx \n\
		testw $2, %%di \n\
		jz 1f \n\
	2:	movl %%ecx, %0 \n\
		subl $2, %%ecx \n\
		jb 4f \n\
		movsw \n\
	1:	movl %%ecx, %0 \n\
		shrl $2, %%ecx \n\
		rep \n\
		movsl \n\
		testb $3, %b0 \n\
		jz 9f \n\
		testb $2, %b0 \n\
		jz 3f \n\
		movsw \n\
	4:	testb $1, %b0 \n\
		jz 9f \n\
	3:	movsb \n\
	9:	" : "=q"(tmp), "=D"(dest), "=S"(src), "=c"(len) : "D"(dest), "S"(src), "c"(len) : "cc", "memory");
	return;
#endif
#if defined(__x86_64__) && !defined(__TINYC__)
#define memcpy_to_fb_implemented
	if (from_fb) {
		size_t edge;
		unsigned char bounce[16];
		static int have_sse41 = -1;	/* -1: not probed yet */

		if (have_sse41 < 0) {
			unsigned eax_out, ecx_out;
			__asm__ ("pushq %%rbx; pushq %%rdx; cpuid; popq %%rdx; popq %%rbx" : "=a"(eax_out), "=c"(ecx_out) : "a"(1));
			have_sse41 = (ecx_out >> 19) & 1;
		}
		if (!have_sse41)
			goto use_memcpy;

		/* Bytes from src up to the next 16-byte boundary. */
		edge = (-(size_t)src) & 0xf;
		if (edge) {
			/*
			 * Load the aligned 16-byte block that contains src into
			 * the bounce buffer and copy out only the wanted bytes.
			 */
			__asm__ volatile ("movntdqa %1, %%xmm0; movdqu %%xmm0, %0" : "=m"(bounce) : "m"(*(src - 0x10 + edge)) : "xmm0", "memory");
			memcpy(dest, bounce + 0x10 - edge, edge < len ? edge : len);
			if (edge >= len)
				return;
			dest += edge;
			src += edge;
			len -= edge;
		}

		/* Whole 16-byte blocks; len ends up 16 below the true remainder. */
		__asm__ volatile ("					\n\
			jmp 2f \n\
			.p2align 4 \n\
		1:	movntdqa (%1), %%xmm0 \n\
			add $16, %1 \n\
			movdqu %%xmm0, (%0) \n\
			add $16, %0 \n\
		2:	sub $16, %2 \n\
			jae 1b \n\
		" : "=r"(dest), "=r"(src), "=r"(len) : "0"(dest), "1"(src), "2"(len) : "xmm0", "memory", "cc");

		/* The low four bits of len are still the leftover byte count. */
		edge = len & 0xf;
		if (edge) {
			__asm__ volatile ("movntdqa %1, %%xmm0; movdqu %%xmm0, %0" : "=m"(bounce) : "m"(*src) : "xmm0", "memory");
			memcpy(dest, bounce, edge);
		}
		return;
	}
use_memcpy:
#endif
#if defined(__aarch64__) && defined(__ARM_FEATURE_UNALIGNED)
#define memcpy_to_fb_implemented
	/*
	 * This is not faster than the glibc memcpy, but there's some problem
	 * with data corruption in the PCIe controller with unaligned writes
	 */
	/*
	 * Labels 1-4 step dest up to 2/4/8/16-byte alignment, label 5 is the
	 * bulk ldp/stp loop, labels 6-9 write the tail selected by the low
	 * bits of the remaining count.
	 */
	unsigned tmp32;
	unsigned long tmp1, tmp2, tmp3, tmp4;
	__asm__ volatile ("					\n\
		tbz %0, #0, 1f \n\
		subs %2, %2, #1 \n\
		b.cc 10f \n\
		ldrb %w3, [ %1 ], #1 \n\
		strb %w3, [ %0 ], #1 \n\
	1: \n\
		tbz %0, #1, 2f \n\
		subs %2, %2, #2 \n\
		b.cc 9f \n\
		ldrh %w3, [ %1 ], #2 \n\
		strh %w3, [ %0 ], #2 \n\
	2: \n\
		tbz %0, #2, 3f \n\
		subs %2, %2, #4 \n\
		b.cc 8f \n\
		ldr %w3, [ %1 ], #4 \n\
		str %w3, [ %0 ], #4 \n\
	3: \n\
		tbz %0, #3, 4f \n\
		subs %2, %2, #8 \n\
		b.cc 7f \n\
		ldr %4, [ %1 ], #8 \n\
		str %4, [ %0 ], #8 \n\
	4: \n\
		subs %2, %2, #16 \n\
		b.cc 6f \n\
		\n\
		tbnz %2, #4, 55f \n\
		ldp %4, %5, [ %1 ], #16 \n\
		stp %4, %5, [ %0 ], #16 \n\
		subs %2, %2, #16 \n\
		b.cc 6f \n\
	55: \n\
		add %0, %0, #16 \n\
		add %1, %1, #16 \n\
		.p2align 3 \n\
	5: \n\
		ldp %4, %5, [ %1, #-16 ] \n\
		ldp %6, %7, [ %1 ], #32 \n\
		subs %2, %2, #32 \n\
		stp %4, %5, [ %0, #-16 ] \n\
		stp %6, %7, [ %0 ], #32 \n\
		b.cs 5b \n\
		sub %0, %0, #16 \n\
		sub %1, %1, #16 \n\
	6: \n\
		tbz %2, #3, 7f \n\
		ldr %4, [ %1 ], #8 \n\
		str %4, [ %0 ], #8 \n\
	7: \n\
		tbz %2, #2, 8f \n\
		ldr %w3, [ %1 ], #4 \n\
		str %w3, [ %0 ], #4 \n\
	8: \n\
		tbz %2, #1, 9f \n\
		ldrh %w3, [ %1 ], #2 \n\
		strh %w3, [ %0 ], #2 \n\
	9: \n\
		tbz %2, #0, 10f \n\
		ldrb %w3, [ %1 ] \n\
		strb %w3, [ %0 ] \n\
	10: \n\
	" : "=r"(dest), "=r"(src), "=r"(len), "=r"(tmp32), "=r"(tmp1), "=r"(tmp2), "=r"(tmp3), "=r"(tmp4) : "0"(dest), "1"(src), "2"(len) : "cc", "memory");
	return;
#endif
#if defined(__alpha__)
#define memcpy_to_fb_implemented
	/*
	 * The glibc memcpy is very slow because it reads the same value
	 * from the framebuffer multiple times.
	 * There's no point in unrolling the loops because performance is
	 * limited by the bus.
	 */
	/*
	 * $22 receives the amask result and selects between the two head/tail
	 * strategies (labels 20-26 use the stb/stw opcodes emitted via .long).
	 */
	unsigned long tmp1, tmp2, tmp3, tmp4;
	__asm__ volatile ("					\n\
		amask 1, $22 \n\
		subq $31, %0, %3 \n\
		beq %2, 6f \n\
		addq %0, %2, %4 \n\
		and %3, 7, %3 \n\
		addq %1, %3, %6 \n\
		cmpult %2, %3, %5 \n\
		beq $22, 20f \n\
		and %4, 3, $23 \n\
		or %5, $23, $23 \n\
		beq $23, 7f \n\
		ldq_u $25, -1(%4) \n\
	7:	beq %3, 1f \n\
		blbs %5, 12f \n\
		and %3, 3, $24 \n\
		beq $24, 8f \n\
		ldq_u %4, 0(%0) \n\
	8:	ldq_u %5, 0(%1) \n\
		subq %2, %3, %2 \n\
		ldq_u %6, -1(%6) \n\
		extql %5, %1, %5 \n\
		extqh %6, %1, %6 \n\
		addq %1, %3, %1 \n\
		or %5, %6, %5 \n\
		beq $24, 9f \n\
		insql %5, %0, %5 \n\
		mskql %4, %0, %4 \n\
		or %5, %4, %4 \n\
		stq_u %4, 0(%0) \n\
		br 10f \n\
	9:	stl %5, 0(%0) \n\
	10:	addq %0, %3, %0 \n\
		\n\
	1:	subq %2, 8, %2 \n\
		blt %2, 4f \n\
		and %1, 7, %4 \n\
		bne %4, 2f \n\
		.p2align 3 \n\
	3:	ldq %3, 0(%1) \n\
		addq %1, 8, %1 \n\
		subq %2, 8, %2 \n\
		stq %3, 0(%0) \n\
		addq %0, 8, %0 \n\
		bge %2, 3b \n\
		br 4f \n\
		\n\
	2:	ldq_u %5, 0(%1) \n\
		.p2align 3 \n\
	5:	ldq_u %4, 8(%1) \n\
		extql %5, %1, %5 \n\
		extqh %4, %1, %6 \n\
		subq %2, 8, %2 \n\
		or %5, %6, %5 \n\
		addq %1, 8, %1 \n\
		stq %5, 0(%0) \n\
		mov %4, %5 \n\
		addq %0, 8, %0 \n\
		bge %2, 5b \n\
		\n\
	4:	and %2, 7, %2 \n\
	12:	beq %2, 6f \n\
		addq %1, %2, %6 \n\
		ldq_u $24, 0(%1) \n\
		ldq_u %6, -1(%6) \n\
		extql $24, %1, $24 \n\
		extqh %6, %1, %6 \n\
		or $24, %6, $24 \n\
		beq $22, 24f \n\
		beq $23, 11f \n\
		mskql $24, %2, $24 \n\
		addq %0, %2, %3 \n\
		insql $24, %0, $24 \n\
		mskql $25, %0, %6 \n\
		mskqh $25, %3, $25 \n\
		or $25, %6, $25 \n\
		or $25, $24, $24 \n\
		stq_u $24, 0(%0) \n\
		br 6f \n\
		\n\
	11:	stl $24, 0(%0) \n\
		br 6f \n\
		\n\
	20:	beq %3, 1b \n\
		addq %1, %2, %4 \n\
		cmovlbs %5, %4, %6 \n\
		ldq_u $24, 0(%1) \n\
		mov %0, $23 \n\
		ldq_u $25, -1(%6) \n\
		extql $24, %1, $24 \n\
		extqh $25, %1, $25 \n\
		addq %0, %3, %0 \n\
		or $24, $25, $24 \n\
		addq %1, %3, %1 \n\
		blbc $23, 21f \n\
		/*stb $24, 0($23)*/ \n\
		.long 0x3b170000 \n\
		addq $23, 1, $23 \n\
		subq %2, 1, %2 \n\
		srl $24, 8, $24 \n\
	21:	and $23, 2, %4 \n\
		beq %4, 22f \n\
		subq %2, 2, %2 \n\
		blt %2, 26f \n\
		/*stw $24, 0($23)*/ \n\
		.long 0x37170000 \n\
		addq $23, 2, $23 \n\
		srl $24, 16, $24 \n\
	22:	and $23, 4, %4 \n\
		beq %4, 1b \n\
		subq %2, 4, %2 \n\
		blt %2, 25f \n\
		stl $24, 0($23) \n\
	23:	br 1b \n\
		\n\
	24:	and %2, 4, %4 \n\
		mov %0, $23 \n\
		beq %4, 25f \n\
		stl $24, 0(%0) \n\
		addq $23, 4, $23 \n\
		srl $24, 32, $24 \n\
	25:	and %2, 2, %4 \n\
		beq %4, 26f \n\
		/*stw $24, 0($23)*/ \n\
		.long 0x37170000 \n\
		addq $23, 2, $23 \n\
		srl $24, 16, $24 \n\
	26:	blbc %2, 6f \n\
		/*stb $24, 0($23)*/ \n\
		.long 0x3b170000 \n\
		\n\
	6: \n\
	" : "=r"(dest), "=r"(src), "=r"(len), "=r"(tmp1), "=r"(tmp2), "=r"(tmp3), "=r"(tmp4) : "0"(dest), "1"(src), "2"(len) : "22", "23", "24", "25", "cc", "memory");
	return;
#endif
#endif
	/* Portable path, also reached via use_memcpy on x86-64. */
	memcpy(dest, src, len);
}
|
|
|
|
/*
 * Out-of-line entry point for memcpy_to_fb_inline().
 *
 * When an architecture-specific copy exists (memcpy_to_fb_implemented),
 * the function is marked ATTR_NOINLINE; otherwise it is a plain inline
 * wrapper.
 */
#ifdef memcpy_to_fb_implemented
static ATTR_NOINLINE void memcpy_to_fb(unsigned char *dest, unsigned char *src, size_t len, int from_fb)
#else
static inline void memcpy_to_fb(unsigned char *dest, unsigned char *src, size_t len, int from_fb)
#endif
{
	memcpy_to_fb_inline(dest, src, len, from_fb);
}
|
|
|
|
/*
 * Move len bytes from src to dest; the two ranges may overlap.
 *
 * When an architecture-specific copy exists (memcpy_to_fb_implemented),
 * the data is staged through a heap bounce buffer: src is copied into it
 * with from_fb = 1, then it is copied to dest with from_fb = 0. The buffer
 * is static, grows on demand and is never released. It is not protected
 * against concurrent callers.
 *
 * If the buffer cannot be grown, or no specific copy exists, plain
 * memmove is used.
 */
static void memmove_in_fb(unsigned char *dest, unsigned char *src, size_t len)
{
#ifdef memcpy_to_fb_implemented
	static unsigned char *buffer = NULL;
	static size_t buffer_len = 0;

	if (!len)
		return;
	if (len > buffer_len) {
		unsigned char *new_buffer = malloc(len);
		if (!new_buffer)
			goto fallback_to_memmove;
		free(buffer);	/* free(NULL) is a no-op */
		buffer = new_buffer;
		/* Record the capacity so later calls can reuse the buffer. */
		buffer_len = len;
	}
	memcpy_to_fb(buffer, src, len, 1);
	memcpy_to_fb(dest, buffer, len, 0);
	return;

fallback_to_memmove:
#endif
	memmove(dest, src, len);
}
|