A little bit more blending and further brightened:
/*
 * Writes four output pixels to out[0..3] from three input pixels a, b, c.
 * Each pixel is treated as three bit fields (0xf800, 0x07e0, 0x001f) that
 * are scaled and mixed independently in thirds:
 *   out[0]: 2/3 of a's top field, a's other two fields unchanged
 *   out[1]: top field 2/3 a + 1/3 b, middle field 1/3 a + 2/3 b, low field b
 *   out[2]: top field b, middle field 2/3 b + 1/3 c, low field 1/3 b + 2/3 c
 *   out[3]: c unchanged
 */
static inline void F0(uint16_t *out, uint16_t a, uint16_t b, uint16_t c)
{
    enum { FIELD_HI = 0xf800, FIELD_MID = 0x07e0, FIELD_LO = 0x001f };

    const uint32_t a_hi = a & FIELD_HI, a_mid = a & FIELD_MID, a_lo = a & FIELD_LO;
    const uint32_t b_hi = b & FIELD_HI, b_mid = b & FIELD_MID, b_lo = b & FIELD_LO;
    const uint32_t c_mid = c & FIELD_MID, c_lo = c & FIELD_LO;

    /* Every mix is re-masked so division remainders cannot leak into a
     * neighbouring field. */
    out[0] = (uint16_t)(((a_hi * 2 / 3) & FIELD_HI) | a_mid | a_lo);

    out[1] = (uint16_t)(((a_hi * 2 / 3 + b_hi / 3) & FIELD_HI) |
                        ((a_mid / 3 + b_mid * 2 / 3) & FIELD_MID) |
                        b_lo);

    out[2] = (uint16_t)(b_hi |
                        ((b_mid * 2 / 3 + c_mid / 3) & FIELD_MID) |
                        ((b_lo / 3 + c_lo * 2 / 3) & FIELD_LO));

    out[3] = c;
}
/*
 * Writes five output pixels to out[0..4] from a previous pixel z and three
 * input pixels a, b, c. Each pixel is treated as three bit fields
 * (0xf800, 0x07e0, 0x001f) mixed independently in thirds:
 *   out[0]: z's top and middle fields, low field 2/3 z + 1/3 a
 *   out[1]: top field 1/3 z + 2/3 a, a's middle and low fields
 *   out[2]: top field 2/3 a + 1/3 b, middle field 1/3 a + 2/3 b, low field b
 *   out[3]: top field b, middle field 2/3 b + 1/3 c, low field 1/3 b + 2/3 c
 *   out[4]: c unchanged
 */
static inline void F(uint16_t *out, uint16_t z, uint16_t a, uint16_t b, uint16_t c)
{
    enum { FIELD_HI = 0xf800, FIELD_MID = 0x07e0, FIELD_LO = 0x001f };

    const uint32_t z_hi = z & FIELD_HI, z_mid = z & FIELD_MID, z_lo = z & FIELD_LO;
    const uint32_t a_hi = a & FIELD_HI, a_mid = a & FIELD_MID, a_lo = a & FIELD_LO;
    const uint32_t b_hi = b & FIELD_HI, b_mid = b & FIELD_MID, b_lo = b & FIELD_LO;
    const uint32_t c_mid = c & FIELD_MID, c_lo = c & FIELD_LO;

    out[0] = (uint16_t)(z_hi | z_mid |
                        ((z_lo * 2 / 3 + a_lo / 3) & FIELD_LO));

    out[1] = (uint16_t)(((z_hi / 3 + a_hi * 2 / 3) & FIELD_HI) | a_mid | a_lo);

    out[2] = (uint16_t)(((a_hi * 2 / 3 + b_hi / 3) & FIELD_HI) |
                        ((a_mid / 3 + b_mid * 2 / 3) & FIELD_MID) |
                        b_lo);

    out[3] = (uint16_t)(b_hi |
                        ((b_mid * 2 / 3 + c_mid / 3) & FIELD_MID) |
                        ((b_lo / 3 + c_lo * 2 / 3) & FIELD_LO));

    out[4] = c;
}
...
static inline void gba_upscale_subpixel(uint16_t *to, uint16_t *from,
uint32_t src_x, uint32_t src_y, uint32_t src_pitch, uint32_t dst_pitch)
{
...
for (y = 0; y < src_y/2; y++) {
src = from;
dst = to;
for (x = 0; x < src_x/3; x++) {
a = bgr555_to_rgb565_16(src[0]);
b = bgr555_to_rgb565_16(src[1]);
c = bgr555_to_rgb565_16(src[2]);
d = bgr555_to_rgb565_16(src[sp]);
e = bgr555_to_rgb565_16(src[sp+1]);
f = bgr555_to_rgb565_16(src[sp+2]);
if (x == 0) {
F0(dst, a, b, c);
F0(&dst[dp], Z(a,d), Z(b,e), Z(c,f));
F0(&dst[dp*2], d, e, f);
} else {
F(dst-1, dst[-1], a, b, c);
F(&dst[dp-1], dst[dp-1], Z(a,d), Z(b,e), Z(c,f));
F(&dst[dp*2-1], dst[dp*2-1], d, e, f);
}
...
There's still that noticeable pattern unfortunately, but I think it looks better. :/
Edit: I've been playing with the scaler for a while. Trying to eliminate the pattern directly seems to restore the fringe (which makes sense since the whole point of sub-pixel rendering is to add a fringe that blends in, but it obviously doesn't do well with large solid blocks of the same color). So, I decided to use a PocketNES trick and do dual-frame blending. It gets rid of the pattern, but it does cause more of a visible fringe I think. :/ Feel free to toy with this.
/*
 * Rearranges the three 5-bit fields of a 16-bit pixel:
 *   input bits 14..10 -> output bits  4..0
 *   input bits  9..5  -> output bits 10..6 (output bit 5 is always 0)
 *   input bits  4..0  -> output bits 15..11
 * Input bit 15 is ignored. Going by the function name this converts
 * BGR555 to RGB565.
 */
static inline uint16_t bgr555_to_rgb565_16(uint16_t px)
{
    const unsigned field_hi  = (px >> 10) & 0x1f;
    const unsigned field_mid = (px >> 5) & 0x1f;
    const unsigned field_lo  = px & 0x1f;

    return (uint16_t)(field_hi | (field_mid << 6) | (field_lo << 11));
}
/* Tries to keep the least significant bits when averaging to avoid rippling.
 *
 * Z0_(A,B): halves both pixels, masks off the bit each channel would
 *           otherwise receive from its neighbour (0x7bef), and adds the halves.
 * Z0(A,B):  Z0_ with the two top bits of each channel OR-ed into that
 *           channel's two lowest bits (>>3 for the 5-bit fields, >>4 for the
 *           6-bit field).
 *
 * NOTE: these are macros; arguments are evaluated more than once, so do not
 * pass expressions with side effects.
 */
#define Z0_(A,B) ((((A) >> 1) & 0x7bef) + (((B) >> 1) & 0x7bef))
#define Z0(A,B) (Z0_(A,B) | ((Z0_(A,B) >> 3) & 0x1803) | ((Z0_(A,B) >> 4) & 0x60))
/* Directly uses A if A == B, otherwise mixes the two with Z0.
 * Arguments are fully parenthesized so that expression arguments
 * (e.g. `x | 1`) compare and select correctly. */
#define Z(A,B) (((A) == (B)) ? (A) : Z0(A,B))
/* These macros mix the RGB components of two RGB565 pixels A and B in thirds.
 * The three digits in the name give how many thirds of red, green and blue
 * are taken from A; the remainder of each component comes from B. So,
 * M210 == 2/3 red A + 1/3 red B, 1/3 green A + 2/3 green B, 3/3 blue B.
 * Note: M233 and M123 are special, only using A as they're used on the left border of the
 * screen and hence there's nothing else to mix with.
 * Each mixed component is re-masked so division remainders cannot spill into
 * the neighbouring component. Arguments are evaluated more than once.
 */
#define M333(A,B) (A)
#define M233(A) ( ((((A)&0xF800)*2/3) & 0xF800) | ((A) & 0x07FF) )
#define M123(A) ( ((((A)&0xF800)/3) & 0xF800) | \
((((A)&0x07E0)*2/3) & 0x07E0) | \
((A) & 0x001F) )
#define M210(A,B) ( ((((A)&0xF800)*2/3 + ((B)&0xF800)/3) & 0xF800) | \
((((A)&0x07E0)/3 + ((B)&0x07E0)*2/3) & 0x07E0) | \
((B) & 0x001F) )
#define M321(A,B) ( ((A)&0xF800) | \
((((A)&0x07E0)*2/3 + ((B)&0x07E0)/3) & 0x07E0) | \
((((A)&0x001F)/3 + ((B)&0x001F)*2/3) & 0x001F) )
#define M332(A,B) ( ((A) & 0xFFE0) | \
((((A)&0x001F)*2/3 + ((B)&0x001F)/3) & 0x001F) )
/* 1/3 red A + 2/3 red B, green and blue entirely from B. */
#define M100(A,B) ( ((((A)&0xF800)/3 + ((B)&0xF800)*2/3) & 0xF800) | \
((B) & 0x07FF) )
/*
 * Emits the four output pixels for the first (left-border) group of a row
 * in the frame == 0 pattern of gba_upscale_subpixel:
 *   out[0] = M233(a), out[1] = M210(a,b), out[2] = M321(b,c), out[3] = c.
 */
static inline void F0(uint16_t *out, uint16_t a, uint16_t b, uint16_t c)
{
    out[0] = M233(a);
    out[1] = M210(a, b);
    out[2] = M321(b, c);
    out[3] = c;
}
/*
 * Emits the four output pixels for the first (left-border) group of a row
 * in the frame == 1 pattern of gba_upscale_subpixel:
 *   out[0] = M123(a), out[1] = M321(a,b), out[2] = M332(b,c), out[3] = c.
 *
 * This is F_1 without its left neighbour: F_1 emits
 * M210(z,a), M321(a,b), M332(b,c), c after its leading pixel, and only the
 * M210(z,a) slot is replaced by the A-only M123(a). The second pixel must
 * therefore mix a with b; mixing b with c there would skip the a/b boundary
 * and blend b/c twice.
 */
static inline void F1(uint16_t *out, uint16_t a, uint16_t b, uint16_t c)
{
    out[0] = M123(a);
    out[1] = M321(a, b);
    out[2] = M332(b, c);
    out[3] = c;
}
/*
 * Emits the four output pixels for the first (left-border) group of a row
 * in the frame == 2 pattern of gba_upscale_subpixel:
 *   out[0] = M321(a,b), out[1] = M332(b,c), out[2] = M100(b,c), out[3] = c.
 */
static inline void F2(uint16_t *out, uint16_t a, uint16_t b, uint16_t c)
{
    out[0] = M321(a, b);
    out[1] = M332(b, c);
    out[2] = M100(b, c);
    out[3] = c;
}
/*
 * Emits five output pixels for a non-border group in the frame == 0 pattern.
 * z is the pixel already stored at out[0] (the caller passes the last pixel
 * of the previous group), which is re-mixed with a:
 *   out[0] = M332(z,a), out[1] = M100(z,a), out[2] = M210(a,b),
 *   out[3] = M321(b,c), out[4] = c.
 */
static inline void F_0(uint16_t *out, uint16_t z, uint16_t a, uint16_t b, uint16_t c)
{
    out[0] = M332(z, a);
    out[1] = M100(z, a);
    out[2] = M210(a, b);
    out[3] = M321(b, c);
    out[4] = c;
}
/*
 * Emits five output pixels for a non-border group in the frame == 1 pattern.
 * The caller passes y and z as the two pixels preceding the group
 * (z being the one currently stored at out[0]):
 *   out[0] = M100(y,z), out[1] = M210(z,a), out[2] = M321(a,b),
 *   out[3] = M332(b,c), out[4] = c.
 */
static inline void F_1(uint16_t *out, uint16_t y, uint16_t z, uint16_t a, uint16_t b, uint16_t c)
{
    out[0] = M100(y, z);
    out[1] = M210(z, a);
    out[2] = M321(a, b);
    out[3] = M332(b, c);
    out[4] = c;
}
/*
 * Emits five output pixels for a non-border group in the frame == 2 pattern.
 * z is the pixel already stored at out[0] (the caller passes the last pixel
 * of the previous group):
 *   out[0] = M210(z,a), out[1] = M321(a,b), out[2] = M332(b,c),
 *   out[3] = M100(b,c), out[4] = c.
 */
static inline void F_2(uint16_t *out, uint16_t z, uint16_t a, uint16_t b, uint16_t c)
{
    out[0] = M210(z, a);
    out[1] = M321(a, b);
    out[2] = M332(b, c);
    out[3] = M100(b, c);
    out[4] = c;
}
/* Fills `count` pixels of the middle output row by combining the pixels
 * directly above and below with Z(). */
static inline void gba_subpixel_fill_middle(const uint16_t *top, uint16_t *mid,
        const uint16_t *bottom, uint32_t count)
{
    uint32_t i;

    for (i = 0; i < count; i++)
        mid[i] = Z(top[i], bottom[i]);
}

/*
 * Upscales an image by 4/3 horizontally and 3/2 vertically: every 3x2 block
 * of source pixels becomes a 4x3 block of destination pixels.
 *
 *   to        - destination buffer (16-bit pixels)
 *   from      - source buffer; every pixel is passed through
 *               bgr555_to_rgb565_16() before mixing
 *   src_x     - source width in pixels; src_x / 3 groups are processed per row
 *   src_y     - source height in pixels; src_y / 2 row pairs are processed
 *   src_pitch - source row stride in bytes
 *   dst_pitch - destination row stride in bytes
 *
 * The function keeps a static counter that cycles through 0, 1, 2 on
 * successive calls and selects the F0/F_0, F1/F_1 or F2/F_2 mixing pattern,
 * so consecutive frames use different sub-pixel patterns. Because of that
 * static state the function is not reentrant.
 */
static inline void gba_upscale_subpixel(uint16_t *to, uint16_t *from,
uint32_t src_x, uint32_t src_y, uint32_t src_pitch, uint32_t dst_pitch)
{
    /* Before:
     *    RRRR   RRRRrr rrrrrr RRRRRR RR
     *  GGGGGG GGgggg ggggGG GGGGGG
     *BB BBBBBB bbbbbb bbBBBB BBBB
     * After (merges r/R, g/G, b/B groups into four pixels)
     * RR GG BB  rr gg bb  RR GG BB  rr gg bb
     * RR GG BB  rr gg bb  RR GG BB  rr gg bb
     * RR GG BB  rr gg bb  RR GG BB  rr gg bb
     */
    static int frame = 0;
    const uint32_t sp = src_pitch / sizeof(uint16_t);
    const uint32_t dp = dst_pitch / sizeof(uint16_t);
    uint32_t x, y;

    frame = (frame + 1) % 3;

    for (y = 0; y < src_y / 2; y++) {
        const uint16_t *src = from;
        uint16_t *dst = to;

        for (x = 0; x < src_x / 3; x++) {
            /* Upper source row: a b c; lower source row: d e f. */
            const uint16_t a = bgr555_to_rgb565_16(src[0]);
            const uint16_t b = bgr555_to_rgb565_16(src[1]);
            const uint16_t c = bgr555_to_rgb565_16(src[2]);
            const uint16_t d = bgr555_to_rgb565_16(src[sp]);
            const uint16_t e = bgr555_to_rgb565_16(src[sp + 1]);
            const uint16_t f = bgr555_to_rgb565_16(src[sp + 2]);

            if (x == 0) {
                /* Left border: nothing to the left to mix with. */
                uint16_t *top = dst;
                uint16_t *bottom = dst + dp * 2;

                switch (frame) {
                case 0:
                    F0(top, a, b, c);
                    F0(bottom, d, e, f);
                    break;
                case 1:
                    F1(top, a, b, c);
                    F1(bottom, d, e, f);
                    break;
                default:
                    F2(top, a, b, c);
                    F2(bottom, d, e, f);
                    break;
                }
                gba_subpixel_fill_middle(top, dst + dp, bottom, 4);
            } else {
                /* Start one pixel early: the last pixel of the previous
                 * group is re-mixed with this group's first pixel. */
                uint16_t *top = dst - 1;
                uint16_t *bottom = dst + dp * 2 - 1;

                switch (frame) {
                case 0:
                    F_0(top, top[0], a, b, c);
                    F_0(bottom, bottom[0], d, e, f);
                    break;
                case 1:
                    F_1(top, top[-1], top[0], a, b, c);
                    F_1(bottom, bottom[-1], bottom[0], d, e, f);
                    break;
                default:
                    F_2(top, top[0], a, b, c);
                    F_2(bottom, bottom[0], d, e, f);
                    break;
                }
                gba_subpixel_fill_middle(top, dst + dp - 1, bottom, 5);
            }

            src += 3;
            dst += 4;
        }

        /* Pitches are in bytes: advance two source rows, three output rows. */
        from = (uint16_t *) (((uint8_t *) from) + src_pitch * 2);
        to   = (uint16_t *) (((uint8_t *) to  ) + dst_pitch * 3);
    }
}