Add interpolation mode 6 (-y6): 3x scaling with simulated scanlines.

Review notes on this revision:
  * calc_yscale(): the first early-out compared the absolute srcy1 with the
    line-relative threshold y1; it now tests ymax like every other branch.
  * Scale_3x(): the second source line (in1) was unpacked as 32bpp BGRx
    although the source is RGB565; it now uses the same unpacking as in0.
  * Scale_3x(): the "second scanline" test in the blend routine read
    ybleed0 (0x90) instead of yscale1/ybleed1 (0x100/0x110).
  * Scale_3x(): the in0 unpacking asm now pins its index to a classic
    register on x86-64, as the in1 asm already did, because it uses %ah.
  * The header names on the four #include lines of the first hunk are
    missing from this copy and must be restored before the patch can
    apply; the added one has to declare what the new code uses
    (uint16_t, floor).
  * Indentation of all lines was reconstructed and may differ from the
    target file; apply with "patch -l" if context does not match.

--- snes9x-1.51-src-orig/unix/x11.cpp	2007-04-29 09:51:08 +0900
+++ snes9x-1.51-src/unix/x11.cpp	2010-11-17 23:00:43 +0900
@@ -170,6 +170,7 @@
 #include
 #include
 #include
+#include
 
 #include "snes9x.h"
 #include "memmap.h"
@@ -574,6 +575,20 @@
 
     XSetWindowAttributes attrib;
 
+    if (GUI.interpolate == 6)
+    {
+        if (GUI.depth >= 24)
+        {
+            GUI.window_width = 960;
+            GUI.window_height = 720;
+        }
+        else
+        {
+            fprintf(stderr, "Interpolation mode 6 requires a 24/32-bit video mode, switching to mode 1\n");
+            GUI.interpolate = 1;
+        }
+    }
+
     attrib.background_pixel = BlackPixelOfScreen (GUI.screen);
     GUI.window = XCreateWindow (GUI.display, RootWindowOfScreen (GUI.screen),
                                 (WidthOfScreen(GUI.screen) - GUI.window_width) / 2,
@@ -873,7 +888,7 @@
     memset (GUI.delta_screen, 0xff, GFX.Pitch * h);
     if (GUI.interpolated_screen)
         ZeroMemory (GUI.interpolated_screen, 512 * 478 * 2);
-    if (GUI.interpolate)
+    if (GUI.interpolate && GUI.interpolate != 6)
     {
         // Offset the rendering of the SNES image by at least one pixel because
         // Kreed's interpolation routines read one pixel beyond the bounds of
@@ -1046,7 +1061,7 @@
 
     if (GUI.interpolate)
     {
-        if (snes_width == 512 && snes_height > 240 && GUI.interpolate != 5)
+        if (snes_width == 512 && snes_height > 240 && GUI.interpolate != 5 && GUI.interpolate != 6)
         {
             GUI.output_screen = (uint8*)GFX.Screen;
             GUI.output_pitch = GFX.Pitch;
@@ -1079,7 +1094,7 @@
         }
         else
         {
-            if (GUI.interpolate != 5)
+            if (GUI.interpolate != 5 && GUI.interpolate != 6)
             {
                 width = 512;
                 if (snes_height < 240)
@@ -1093,7 +1108,7 @@
                 width = GUI.window_width;
                 cheight = height = GUI.window_height;
             }
-            if (GUI.image_needs_scaling || GUI.interpolate == 5)
+            if (GUI.image_needs_scaling || GUI.interpolate == 5 || GUI.interpolate == 6)
             {
                 GUI.box.x = 0;
                 GUI.box.y = 0;
@@ -1105,8 +1120,9 @@
             }
 
             // Kreed's bi-linear image filter scales as well
-            if ((GUI.image_needs_scaling && GUI.interpolate != 5) ||
-                (GUI.depth != 15 && GUI.depth != 16))
+            if (GUI.interpolate != 6 &&
+                ((GUI.image_needs_scaling && GUI.interpolate != 5) ||
+                 (GUI.depth != 15 && GUI.depth != 16)))
             {
                 GUI.output_screen = GUI.interpolated_screen;
                 GUI.output_pitch = 512 * 2;
@@ -1166,8 +1182,9 @@
         }
     }
 
-    if ((GUI.depth != 15 && GUI.depth != 16) ||
-        (GUI.image_needs_scaling && GUI.interpolate != 5))
+    if (GUI.interpolate != 6 &&
+        ((GUI.depth != 15 && GUI.depth != 16) ||
+         (GUI.image_needs_scaling && GUI.interpolate != 5)))
     {
         done = TRUE;
         switch (GUI.depth)
@@ -2041,6 +2058,7 @@
     S9xMessage(S9X_INFO, S9X_USAGE, "-y3 Enable Kreed's Super Eagle image processing");
     S9xMessage(S9X_INFO, S9X_USAGE, "-y4 Enable Kreed's 2xSaI image processing");
     S9xMessage(S9X_INFO, S9X_USAGE, "-y5 Enable Kreed's software bi-linear filtering");
+    S9xMessage(S9X_INFO, S9X_USAGE, "-y6 Enable 3x scanline scaling");
     S9xMessage(S9X_INFO, S9X_USAGE, "-GUI.interpolate Same as -y");
     S9xMessage(S9X_INFO, S9X_USAGE, "-scale or -sc Scale image to fit window");
 #ifdef USE_DGA_EXTENSION
@@ -2076,6 +2094,7 @@
                 case '3': GUI.interpolate = 3; break;
                 case '4': GUI.interpolate = 4; break;
                 case '5': GUI.interpolate = 5; break;
+                case '6': GUI.interpolate = 6; break;
                 }
             }
             else
@@ -2091,6 +2110,7 @@
                 case '3': GUI.interpolate = 3; break;
                 case '4': GUI.interpolate = 4; break;
                 case '5': GUI.interpolate = 5; break;
+                case '6': GUI.interpolate = 6; break;
                 }
             }
             else
@@ -2121,9 +2142,509 @@
     S9xSetInfoString (buffer);
 }
 
+/* Relative height of a minimum-brightness scanline (absolute height
+ * divided by distance between adjacent scanlines). */
+#define SCANLINE_HEIGHT 0.6f
+/* Relative distance over which the edges of a scanline fade to zero (in
+ * the same units as SCANLINE_HEIGHT and SCANLINE_BLEED). */
+#define SCANLINE_FADE 0.1f
+/* Additional amount by which a maximum-brightness scanline "bleeds" (the
+ * difference between the relative height of a maximum-brightness scanline
+ * and SCANLINE_HEIGHT). Slower. */
+#define SCANLINE_BLEED 0.2f
+
+/* Make sure we use 64-bit registers for pointers on x86-64. */
+#ifdef __x86_64__
+# define RAX "%%rax"
+# define RBX "%%rbx"
+# define RCX "%%rcx"
+# define RDX "%%rdx"
+# define RSI "%%rsi"
+# define RDI "%%rdi"
+#else
+# define RAX "%%eax"
+# define RBX "%%ebx"
+# define RCX "%%ecx"
+# define RDX "%%edx"
+# define RSI "%%esi"
+# define RDI "%%edi"
+#endif
+
+/*
+ * Return how much of the source-line span [srcy0, srcy1) is lit, in units
+ * of one source line.  Each source line is modelled as a bright band
+ * centred at 0.5 with linear fade-out edges (SCANLINE_HEIGHT,
+ * SCANLINE_FADE); Y scales the extra band height added by SCANLINE_BLEED.
+ * Positions are taken relative to floor(srcy0), so the span is expected to
+ * stay within one source line (Scale_3x splits spans at line boundaries).
+ * Returns 0 when srcy1 <= srcy0.
+ */
+static inline float calc_yscale(float srcy0, float srcy1, float Y)
+{
+    /* Calculate how much of this line is actually displayed, in steps */
+
+    if (srcy1 <= srcy0)
+        return 0;
+
+#ifdef SCANLINE_BLEED
+    const float height = SCANLINE_HEIGHT + SCANLINE_BLEED*Y;
+#else
+    const float height = SCANLINE_HEIGHT;
+#endif
+    const float y1 = 0.5f - height/2 - SCANLINE_FADE;
+    const float y2 = 0.5f - height/2;
+    const float y3 = 0.5f + height/2;
+    const float y4 = 0.5f + height/2 + SCANLINE_FADE;
+    float ytmp = srcy0 - floor(srcy0);
+    const float ymax = srcy1 - floor(srcy0);  // not a typo!
+    float ytotal = 0.0f;
+
+    if (ytmp < y1) {
+        /* Compare in line-relative coordinates like every other branch;
+         * the absolute srcy1 is only correct for source line 0. */
+        if (ymax <= y1) {
+            goto finished;
+        } else {
+            ytmp = y1;
+        }
+    }
+    if (ytmp < y2) {
+        /* Just a simple linear falloff for now */
+        if (ymax <= y2) {
+            ytotal += 0.5f*(ymax-y1)*(ymax-y1) - 0.5f*(ytmp-y1)*(ytmp-y1);
+            goto finished;
+        } else {
+            ytotal += 0.5f*(y2-y1)*(y2-y1) - 0.5f*(ytmp-y1)*(ytmp-y1);
+            ytmp = y2;
+        }
+    }
+    if (ytmp < y3) {
+        if (ymax <= y3) {
+            ytotal += ymax - ytmp;
+            goto finished;
+        } else {
+            ytotal += y3 - ytmp;
+            ytmp = y3;
+        }
+    }
+    if (ytmp < y4) {
+        /* Again, linear falloff for now */
+        if (ymax <= y4) {
+            ytotal += 0.5f*(y4-ytmp)*(y4-ytmp) - 0.5f*(y4-ymax)*(y4-ymax);
+        } else {
+            ytotal += 0.5f*(y4-ytmp)*(y4-ytmp);
+        }
+    }
+  finished:
+    return ytotal;
+}
+
+/*
+ * Scale a 16bpp (RGB565) source image to a 32bpp destination while drawing
+ * each source line as a scanline whose brightness profile comes from
+ * calc_yscale().  Horizontal resampling blends each source pixel with its
+ * right-hand neighbour using per-column weights that are cached across
+ * calls and rebuilt whenever the input or output size changes.
+ * The vectorized path requires dstWidth to be a multiple of 8 and exits
+ * the process if bpp is not 24 or 32 or if allocation fails.
+ */
+static void Scale_3x (uint8 *srcPtr, uint32 srcPitch, uint8 * /* deltaPtr */,
+                      uint8 *dstPtr, uint32 dstPitch, int width, int height,
+                      int dstWidth, int dstHeight, int bpp)
+{
+    /* Parameters used for precalculated data */
+    static int inw = 0, inh = 0, outw = 0, outh = 0;
+    /* The precalculated data itself */
+    static uint16_t *x_source = NULL;  // Source for each X pixel
+    static float *x_weight1 = NULL;    // Weight for left (base) pixel
+    static float *x_weight2 = NULL;    // Weight for right (adjacent) pixel
+    static uint16_t *x_wgt16_1 = NULL; // As x_weight1, in 1.15 fixed point
+    static uint16_t *x_wgt16_2 = NULL; // As x_weight2, in 1.15 fixed point
+
+    if (inw != width || inh != height
+     || outw != dstWidth || outh != dstHeight
+    ) {
+        /* Parameters changed (or first call), precalculate data */
+        free(x_source);
+        x_source = NULL;
+        free(x_weight1);
+        x_weight1 = NULL;
+        free(x_weight2);
+        x_weight2 = NULL;
+        free(x_wgt16_1);
+        x_wgt16_1 = NULL;
+        free(x_wgt16_2);
+        x_wgt16_2 = NULL;
+        x_source = (uint16_t *)malloc(dstWidth*2);
+        x_weight1 = (float *)malloc(dstWidth*4);
+        x_weight2 = (float *)malloc(dstWidth*4);
+        x_wgt16_1 = (uint16_t *)malloc(dstWidth*2);
+        x_wgt16_2 = (uint16_t *)malloc(dstWidth*2);
+        if (!x_source || !x_weight1 || !x_weight2 || !x_wgt16_1 || !x_wgt16_2){
+            fprintf(stderr, "Sorry, out of memory\n");
+            exit(1);
+        }
+        int x;
+        for (x = 0; x < dstWidth; x++) {
+            const float srcx0 = x * width / (float)dstWidth;
+            const float srcx1 = (x+1) * width / (float)dstWidth;
+            const int floor0 = (int)floor(srcx0);
+            const int floor1 = (int)floor(srcx1);
+            const float xfrac = (floor0==floor1 ? 1.0f
+                                 : (floor1-srcx0) / (srcx1-srcx0));
+            x_source [x] = floor0;
+            x_weight1[x] = xfrac;
+            x_weight2[x] = 1.0 - xfrac;
+            x_wgt16_1[x] = (int)(xfrac * 32768);
+            x_wgt16_2[x] = 32768 - x_wgt16_1[x];
+        }
+        inw = width;
+        inh = height;
+        outw = dstWidth;
+        outh = dstHeight;
+    }
+
+
+    const int Bpp = bpp/8;
+#ifdef SLOW_CLEVERNESS  // not defined
+    const int Rofs = (GUI.red_shift==0 ? 0 : 2);
+    const int Gofs = 1;
+    const int Bofs = (GUI.blue_shift==0 ? 0 : 2);
+#endif
+    int y;
+
+    if (bpp != 24 && bpp != 32) {
+        fprintf(stderr, "Sorry, non-24/32bpp broken\n");
+        exit(1);
+    }
+    for (y = 0; y < dstHeight; y++) {
+        const float srcy0 = y * height / (float)dstHeight;
+        const float srcy1 = (y+1) * height / (float)dstHeight;
+        const uint16_t *in0 =
+            (uint16_t *)(srcPtr + ((int)floor(srcy0)*srcPitch));
+#ifndef SLOW_CLEVERNESS
+        const uint16_t *in1 =
+            (uint16_t *)(srcPtr + ((int)floor(srcy1)*srcPitch));
+#endif
+        float yscale0, ybleed0, yscale1, ybleed1;
+        if (floor(srcy0) == floor(srcy1)) {
+            yscale0 = calc_yscale(srcy0, srcy1, 0) / (srcy1-srcy0);
+            ybleed0 = calc_yscale(srcy0, srcy1, 1) / (srcy1-srcy0) - yscale0;
+            yscale1 = 0;
+            ybleed1 = 0;
+        } else {
+            yscale0 = calc_yscale(srcy0, floor(srcy1), 0) / (srcy1-srcy0);
+            ybleed0 = calc_yscale(srcy0, floor(srcy1), 1) / (srcy1-srcy0)
+                    - yscale0;
+            yscale1 = calc_yscale(floor(srcy1), srcy1, 0) / (srcy1-srcy0);
+            ybleed1 = calc_yscale(floor(srcy1), srcy1, 1) / (srcy1-srcy0)
+                    - yscale1;
+        }
+        int x;
+#ifdef SLOW_CLEVERNESS  // not defined
+        for (x = 0; x < dstWidth; x++) {
+            const int srcx = x_source[x];
+            const float x0frac = x_weight1[x];
+            const float x1frac = x_weight2[x];
+            const float R = ((in0[srcx  ]>>11 & 0x1F) * x0frac)
+                          + ((in0[srcx+1]>>11 & 0x1F) * x1frac);
+            const float G = ((in0[srcx  ]>> 6 & 0x1F) * x0frac)
+                          + ((in0[srcx+1]>> 6 & 0x1F) * x1frac);
+            const float B = ((in0[srcx  ]>> 0 & 0x1F) * x0frac)
+                          + ((in0[srcx+1]>> 0 & 0x1F) * x1frac);
+            const float Y = 0.299f*R + 0.587f*G + 0.114f*B;
+            const float yscale = calc_yscale(srcy0, srcy1, Y/31.0f) / (srcy1-srcy0) * (255.0f/31.0f);
+            dstPtr[y*dstPitch+x*Bpp+Rofs] = R * yscale;
+            dstPtr[y*dstPitch+x*Bpp+Gofs] = G * yscale;
+            dstPtr[y*dstPitch+x*Bpp+Bofs] = B * yscale;
+        }
+#else  // speed along!
+        /* Vectorized assembly; output width must be a multiple of 8, 32bpp
+         * BGRx only */
+#ifdef SCANLINE_BLEED
+        __attribute__((aligned(16))) static const struct {uint16_t i[24];} cdata = {{
+            /* 0x00: RGB -> Y (unsigned) */
+            0.299/255*16777216, 0.299/255*16777216,
+            0.299/255*16777216, 0.299/255*16777216,
+            0.299/255*16777216, 0.299/255*16777216,
+            0.299/255*16777216, 0.299/255*16777216,
+            0.587/255*16777216, 0.587/255*16777216,
+            0.587/255*16777216, 0.587/255*16777216,
+            0.587/255*16777216, 0.587/255*16777216,
+            0.587/255*16777216, 0.587/255*16777216,
+            0.114/255*16777216, 0.114/255*16777216,
+            0.114/255*16777216, 0.114/255*16777216,
+            0.114/255*16777216, 0.114/255*16777216,
+            0.114/255*16777216, 0.114/255*16777216,
+        }};
+#endif
+        __attribute__((aligned(16))) struct {uint16_t i[144];} vdata;
+        /* 0x000: in0: R0, G0, B0, R1, G1, B1 */
+        /* 0x060: xNfrac0, xNfrac1 */
+        /* 0x080: yscale0, ybleed0 */
+        /* 0x0A0: in1: R0, G0, B0, R1, G1, B1 */
+        /* 0x100: yscale1, ybleed1 */
+        memset(&vdata, 0x80, sizeof(vdata));
+        vdata.i[64] = vdata.i[65] = vdata.i[66] = vdata.i[67] =
+        vdata.i[68] = vdata.i[69] = vdata.i[70] = vdata.i[71] = yscale0*512;
+        vdata.i[72] = vdata.i[73] = vdata.i[74] = vdata.i[75] =
+        vdata.i[76] = vdata.i[77] = vdata.i[78] = vdata.i[79] = ybleed0*1024;
+        vdata.i[128]= vdata.i[129]= vdata.i[130]= vdata.i[131]=
+        vdata.i[132]= vdata.i[133]= vdata.i[134]= vdata.i[135]= yscale1*512;
+        vdata.i[136]= vdata.i[137]= vdata.i[138]= vdata.i[139]=
+        vdata.i[140]= vdata.i[141]= vdata.i[142]= vdata.i[143]= ybleed1*1024;
+        for (x = 0; x < dstWidth; x += 8) {
+            int i;
+#if 0  // SSE2 routine, 2500us @ 896x672 -- but not fixed to handle 16bpp input
+            asm("movq ("RSI",%1,4), %%xmm0"
+                : : "S" (in0), "r" ((long)x_source[x  ]));
+            asm("movq ("RSI",%1,4), %%xmm1"
+                : : "S" (in0), "r" ((long)x_source[x+1]));
+            asm("movq ("RSI",%1,4), %%xmm2"
+                : : "S" (in0), "r" ((long)x_source[x+2]));
+            asm("movq ("RSI",%1,4), %%xmm3"
+                : : "S" (in0), "r" ((long)x_source[x+3]));
+            asm("movq ("RSI",%1,4), %%xmm4"
+                : : "S" (in0), "r" ((long)x_source[x+4]));
+            asm("movq ("RSI",%1,4), %%xmm5"
+                : : "S" (in0), "r" ((long)x_source[x+5]));
+            asm("movq ("RSI",%1,4), %%xmm6"
+                : : "S" (in0), "r" ((long)x_source[x+6]));
+            asm("movq ("RSI",%1,4), %%xmm7"
+                : : "S" (in0), "r" ((long)x_source[x+7]));
+            asm("punpcklbw %%xmm1, %%xmm0   # XMM0: RGBb,a RGB1,0   \n\
+                punpcklbw %%xmm3, %%xmm2    # XMM2: RGBd,c RGB3,2   \n\
+                punpcklbw %%xmm5, %%xmm4    # XMM4: RGBf,e RGB5,4   \n\
+                punpcklbw %%xmm7, %%xmm6    # XMM6: RGBh,g RGB7,6   \n\
+                movdqa %%xmm0, %%xmm1       # XMM1: RGBb,a RGB1,0   \n\
+                movdqa %%xmm4, %%xmm5       # XMM5: RGBf,e RGB5,4   \n\
+                punpcklwd %%xmm2, %%xmm0    # XMM0: R3-R0G3-G0B3-B0 \n\
+                punpckhwd %%xmm2, %%xmm1    # XMM1: Rd-RaGd-GaBd-Ba \n\
+                punpcklwd %%xmm6, %%xmm4    # XMM4: R7-R4G7-G4B7-B4 \n\
+                punpckhwd %%xmm6, %%xmm5    # XMM5: Rh-ReGh-GeBh-Be \n\
+                movdqa %%xmm0, %%xmm2       # XMM1: R3-R0G3-G0B3-B0 \n\
+                punpckldq %%xmm4, %%xmm0    # XMM0: G7-G0B7-B0      \n\
+                punpckhdq %%xmm4, %%xmm2    # XMM1: R7-R0           \n\
+                movdqa ("RDI"), %%xmm3      # XMM3: 0x80 x 16       \n\
+                movdqa %%xmm3, %%xmm6       # XMM6: 0x80 x 16       \n\
+                movdqa %%xmm3, %%xmm7       # XMM7: 0x80 x 16       \n\
+                punpcklbw %%xmm2, %%xmm3    # XMM3: R7 R6 ... R0    \n\
+                punpckhbw %%xmm0, %%xmm6    # XMM6: G7 G6 ... G0    \n\
+                punpcklbw %%xmm0, %%xmm7    # XMM7: B7 B6 ... B0    \n\
+                movdqa %%xmm3, ("RDI")                              \n\
+                movdqa %%xmm6, 0x10("RDI")                          \n\
+                movdqa %%xmm7, 0x20("RDI")                          \n\
+                movdqa %%xmm1, %%xmm2       # XMM2: Rd-RaGd-GaBd-Ba \n\
+                punpckldq %%xmm5, %%xmm1    # XMM0: Gh-GaBh-Ba      \n\
+                punpckhdq %%xmm5, %%xmm2    # XMM1: Rh-Ra           \n\
+                movdqa 0x40("RDI"), %%xmm3  # XMM3: 0x80 x 16       \n\
+                movdqa %%xmm3, %%xmm6       # XMM6: 0x80 x 16       \n\
+                movdqa %%xmm3, %%xmm7       # XMM7: 0x80 x 16       \n\
+                punpcklbw %%xmm2, %%xmm3    # XMM3: Rh Rg ... Ra    \n\
+                punpckhbw %%xmm1, %%xmm6    # XMM6: Gh Gg ... Ga    \n\
+                punpcklbw %%xmm1, %%xmm7    # XMM7: Bh Bg ... Ba    \n\
+                movdqa %%xmm3, 0x30("RDI")                          \n\
+                movdqa %%xmm6, 0x40("RDI")                          \n\
+                movdqa %%xmm7, 0x50("RDI")                          \n"
+                : /* no outputs */
+                : "D" (&vdata), "m" (vdata)
+            );
+#endif
+            for (i = 0; i < 8; i++) {
+#if 1  // simple routine, 3300us @ 896x672
+                asm("\
+                    mov ("RSI",%1,2), %%eax     \n\
+                    mov %%eax, %%edx            \n\
+                    and $0xF81FF81F, %%eax      \n\
+                    and $0x07C007C0, %%edx      \n\
+                    mov %%ah, 1("RDI",%3,2)     \n\
+                    ror $16, %%eax              \n\
+                    mov %%ah, 49("RDI",%3,2)    \n\
+                    shr $3, %%edx               \n\
+                    shl $3, %%eax               \n\
+                    mov %%dl, 17("RDI",%3,2)    \n\
+                    mov %%al, 81("RDI",%3,2)    \n\
+                    shr $16, %%edx              \n\
+                    shr $16, %%eax              \n\
+                    mov %%dl, 65("RDI",%3,2)    \n\
+                    mov %%al, 33("RDI",%3,2)    \n"
+                    : /* no outputs */
+                    : "S" (in0), "r" ((long)x_source[x+i]), "D" (&vdata),
+#ifdef __x86_64__  // Need a classic register for %ah to work.
+                      "c" ((long)i),
+#else
+                      "r" ((long)i),
+#endif
+                      "m" (vdata)
+                    : "eax", "edx"
+                );
+#endif
+                vdata.i[i+48] = x_wgt16_1[x+i];
+                vdata.i[i+56] = x_wgt16_2[x+i];
+                if (vdata.i[128] /*yscale1*/ || vdata.i[136] /*ybleed1*/) {
+                    /* The second source line is RGB565 just like the
+                     * first, so unpack it the same way into the in1
+                     * slots (in0 offsets + 0xA0). */
+                    asm("\
+                        mov ("RSI",%1,2), %%eax     \n\
+                        mov %%eax, %%edx            \n\
+                        and $0xF81FF81F, %%eax      \n\
+                        and $0x07C007C0, %%edx      \n\
+                        mov %%ah, 161("RDI",%3,2)   \n\
+                        ror $16, %%eax              \n\
+                        mov %%ah, 209("RDI",%3,2)   \n\
+                        shr $3, %%edx               \n\
+                        shl $3, %%eax               \n\
+                        mov %%dl, 177("RDI",%3,2)   \n\
+                        mov %%al, 241("RDI",%3,2)   \n\
+                        shr $16, %%edx              \n\
+                        shr $16, %%eax              \n\
+                        mov %%dl, 225("RDI",%3,2)   \n\
+                        mov %%al, 193("RDI",%3,2)   \n"
+                        : /* no outputs */
+                        : "S" (in1), "r" ((long)x_source[x+i]), "D" (&vdata),
+#ifdef __x86_64__  // Need a classic register for %ah/%dh to work.
+                          "c" ((long)i),
+#else
+                          "r" ((long)i),
+#endif
+                          "m" (vdata)
+                        : "eax", "edx"
+                    );
+                }
+            }
+            // 1850us @ 896x672
+            asm("\
+                # Load input RGB data, and merge RGB values as dictated \n\
+                # by X weights                                          \n\
+                movdqa 0x60("RSI"), %%xmm6  # XMM6: Xn0 <<15        \n\
+                movdqa 0x70("RSI"), %%xmm7  # XMM7: Xn1 <<15        \n\
+                movdqa 0x00("RSI"), %%xmm0  # XMM0: Rn0 <<8         \n\
+                pmulhuw %%xmm6, %%xmm0      # XMM0: Rn0 scaled <<7  \n\
+                movdqa 0x10("RSI"), %%xmm1  # XMM1: Gn0 <<8         \n\
+                pmulhuw %%xmm6, %%xmm1      # XMM1: Gn0 scaled <<7  \n\
+                movdqa 0x20("RSI"), %%xmm2  # XMM2: Bn0 <<8         \n\
+                pmulhuw %%xmm6, %%xmm2      # XMM2: Bn0 scaled <<7  \n\
+                movdqa 0x30("RSI"), %%xmm3  # XMM3: Rn1 <<8         \n\
+                pmulhuw %%xmm7, %%xmm3      # XMM3: Rn1 scaled <<7  \n\
+                movdqa 0x40("RSI"), %%xmm4  # XMM4: Gn1 <<8         \n\
+                pmulhuw %%xmm7, %%xmm4      # XMM4: Gn1 scaled <<7  \n\
+                movdqa 0x50("RSI"), %%xmm5  # XMM5: Bn1 <<8         \n\
+                pmulhuw %%xmm7, %%xmm5      # XMM5: Bn1 scaled <<7  \n\
+                paddw %%xmm3, %%xmm0        # XMM0: Rn <<7          \n\
+                paddw %%xmm4, %%xmm1        # XMM1: Gn <<7          \n\
+                paddw %%xmm5, %%xmm2        # XMM2: Bn <<7          \n\
+                # Load Y factor                                     \n\
+                movdqa 0x80("RSI"), %%xmm7  # XMM7: yscale0 <<9     \n"
+#ifdef SCANLINE_BLEED
+"               # See if we need to do a bleed check                \n\
+                mov 0x90("RSI"), %%eax                              \n\
+                test %%eax, %%eax                                   \n\
+                jz 0f                                               \n\
+                # Convert RGB to luma for bleeding                  \n\
+                movdqa %%xmm0, %%xmm3       # XMM3: Rn <<7          \n\
+                pmulhuw ("RBX"), %%xmm3     # XMM3: YRn <<15        \n\
+                movdqa %%xmm1, %%xmm4       # XMM4: Gn <<7          \n\
+                pmulhuw 0x10("RBX"), %%xmm4 # XMM4: YGn <<15        \n\
+                movdqa %%xmm2, %%xmm5       # XMM5: Bn <<7          \n\
+                pmulhuw 0x20("RBX"), %%xmm5 # XMM5: YBn <<15        \n\
+                paddw %%xmm4, %%xmm3                                \n\
+                paddw %%xmm5, %%xmm3        # XMM3: Y <<15          \n\
+                # Calculate bleed factor and add to Y factor        \n\
+                pmulhuw 0x90("RSI"), %%xmm3 # XMM3: bleed <<9       \n\
+                paddw %%xmm3, %%xmm7        # XMM7: new yscale0 <<9 \n"
+#endif
+"0:             # Scale by Y factor                                 \n\
+                pmulhuw %%xmm7, %%xmm0      # XMM0: new Rn          \n\
+                pmulhuw %%xmm7, %%xmm1      # XMM1: new Gn          \n\
+                pmulhuw %%xmm7, %%xmm2      # XMM2: new Bn          \n\
+                # See if we have a second scanline to process       \n\
+                # (yscale1 or ybleed1 nonzero)                      \n\
+                movzwl 0x100("RSI"), %%eax                          \n\
+                orw 0x110("RSI"), %%ax                              \n\
+                test %%eax, %%eax                                   \n\
+                jz 1f                                               \n\
+                # We do; first save the current RGB (overwrite the old \n\
+                # source pixels, as we don't need them anymore)     \n\
+                movdqa %%xmm0, ("RSI")                              \n\
+                movdqa %%xmm1, 0x10("RSI")                          \n\
+                movdqa %%xmm2, 0x20("RSI")                          \n\
+                # Process the second scanline just as above         \n\
+                movdqa 0x60("RSI"), %%xmm6  # XMM6: Xn0 <<15        \n\
+                movdqa 0x70("RSI"), %%xmm7  # XMM7: Xn1 <<15        \n\
+                movdqa 0xA0("RSI"), %%xmm0  # XMM0: Rn0 <<8         \n\
+                pmulhuw %%xmm6, %%xmm0      # XMM0: Rn0 scaled <<7  \n\
+                movdqa 0xB0("RSI"), %%xmm1  # XMM1: Gn0 <<8         \n\
+                pmulhuw %%xmm6, %%xmm1      # XMM1: Gn0 scaled <<7  \n\
+                movdqa 0xC0("RSI"), %%xmm2  # XMM2: Bn0 <<8         \n\
+                pmulhuw %%xmm6, %%xmm2      # XMM2: Bn0 scaled <<7  \n\
+                movdqa 0xD0("RSI"), %%xmm3  # XMM3: Rn1 <<8         \n\
+                pmulhuw %%xmm7, %%xmm3      # XMM3: Rn1 scaled <<7  \n\
+                movdqa 0xE0("RSI"), %%xmm4  # XMM4: Gn1 <<8         \n\
+                pmulhuw %%xmm7, %%xmm4      # XMM4: Gn1 scaled <<7  \n\
+                movdqa 0xF0("RSI"), %%xmm5  # XMM5: Bn1 <<8         \n\
+                pmulhuw %%xmm7, %%xmm5      # XMM5: Bn1 scaled <<7  \n\
+                paddw %%xmm3, %%xmm0        # XMM0: Rn <<7          \n\
+                paddw %%xmm4, %%xmm1        # XMM1: Gn <<7          \n\
+                paddw %%xmm5, %%xmm2        # XMM2: Bn <<7          \n\
+                movdqa 0x100("RSI"), %%xmm7 # XMM7: yscale1 <<9     \n"
+#ifdef SCANLINE_BLEED
+"               mov 0x110("RSI"), %%eax                             \n\
+                test %%eax, %%eax                                   \n\
+                jz 0f                                               \n\
+                movdqa %%xmm0, %%xmm3       # XMM3: Rn <<7          \n\
+                pmulhuw ("RBX"), %%xmm3     # XMM3: YRn <<15        \n\
+                movdqa %%xmm1, %%xmm4       # XMM4: Gn <<7          \n\
+                pmulhuw 0x10("RBX"), %%xmm4 # XMM4: YGn <<15        \n\
+                movdqa %%xmm2, %%xmm5       # XMM5: Bn <<7          \n\
+                pmulhuw 0x20("RBX"), %%xmm5 # XMM5: YBn <<15        \n\
+                paddw %%xmm4, %%xmm3                                \n\
+                paddw %%xmm5, %%xmm3        # XMM3: Y <<15          \n\
+                pmulhuw 0x110("RSI"), %%xmm3 # XMM3: bleed <<9      \n\
+                paddw %%xmm3, %%xmm7        # XMM7: new yscale1 <<9 \n"
+#endif
+"0:             # Scale by Y factor                                 \n\
+                pmulhuw %%xmm7, %%xmm0      # XMM0: new Rn          \n\
+                pmulhuw %%xmm7, %%xmm1      # XMM1: new Gn          \n\
+                pmulhuw %%xmm7, %%xmm2      # XMM2: new Bn          \n\
+                # Add in the first scanline's values                \n\
+                paddw ("RSI"), %%xmm0                               \n\
+                paddw 0x10("RSI"), %%xmm1                           \n\
+                paddw 0x20("RSI"), %%xmm2                           \n\
+1:              # Interleave and saturate RGB values                \n\
+                pxor %%xmm7, %%xmm7         # XMM7: 0's             \n\
+                movdqa %%xmm0, %%xmm4       # XMM4: Rn              \n\
+                movdqa %%xmm1, %%xmm5       # XMM5: Gn              \n\
+                movdqa %%xmm2, %%xmm6       # XMM6: Bn              \n\
+                punpcklwd %%xmm1, %%xmm2    # Interleave B and G    \n\
+                punpcklwd %%xmm7, %%xmm0    # Interleave R and 0    \n\
+                packuswb %%xmm7, %%xmm2     # Saturate B and G      \n\
+                packuswb %%xmm7, %%xmm0     # Saturate R            \n\
+                punpcklwd %%xmm0, %%xmm2    # Interleave R/G/B      \n\
+                punpckhwd %%xmm5, %%xmm6    # Do it again (high wds)\n\
+                punpckhwd %%xmm7, %%xmm4                            \n\
+                packuswb %%xmm7, %%xmm6                             \n\
+                packuswb %%xmm7, %%xmm4                             \n\
+                punpcklwd %%xmm4, %%xmm6                            \n\
+                # Store to destination                              \n\
+                movdqu %%xmm2, ("RDI")                              \n\
+                movdqu %%xmm6, 16("RDI")                            \n"
+                : "=m" (vdata) /* because it may be modified */
+                : "S" (&vdata), "D" (dstPtr + y*dstPitch + x*Bpp)
+#ifdef SCANLINE_BLEED
+                  , "b" (&cdata), "m" (cdata)
+#endif
+                : "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+                  "xmm6", "xmm7"
+            );
+        }
+#endif  // SLOW_CLEVERNESS or not
+    }
+}
+
 void TVMode (int width, int height)
 {
-    switch (width != 256 && GUI.interpolate != 5 ? 1 : GUI.interpolate)
+    switch (width != 256 && GUI.interpolate != 5 && GUI.interpolate != 6 ? 1 : GUI.interpolate)
     {
     case 2:
         Super2xSaI ((uint8*)GFX.Screen, GFX.Pitch, GUI.delta_screen, GUI.output_screen,
@@ -2163,6 +2683,11 @@
         Scale_2xSaI ((uint8*)GFX.Screen, GFX.Pitch, GUI.delta_screen, GUI.output_screen,
                      GUI.output_pitch, GUI.window_width, GUI.window_height, width, height);
         break;
+    case 6:
+        Scale_3x ((uint8*)GFX.Screen, GFX.Pitch, GUI.delta_screen, GUI.output_screen,
+                  GUI.output_pitch, width, height,
+                  GUI.window_width, height>240 ? height*3/2 : height*3, GUI.image->bits_per_pixel);
+        break;
     case 1:
     {
         uint8 *nextLine, *srcPtr, *deltaPtr, *finish;