--- snes9x-1.51-src-orig/unix/x11.cpp	2007-04-29 09:51:08 +0900
+++ snes9x-1.51-src/unix/x11.cpp	2010-11-17 23:00:43 +0900
@@ -170,6 +170,7 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
+#include <math.h>
 
 #include "snes9x.h"
 #include "memmap.h"
@@ -574,6 +575,20 @@
 
     XSetWindowAttributes attrib;
 
+    if (GUI.interpolate == 6)
+    {
+	if (GUI.depth >= 24)
+	{
+	    GUI.window_width = 960;
+	    GUI.window_height = 720;
+	}
+	else
+	{
+	    fprintf(stderr, "Interpolation mode 6 requires a 24/32-bit video mode, switching to mode 1\n");
+	    GUI.interpolate = 1;
+	}
+    }
+
     attrib.background_pixel = BlackPixelOfScreen (GUI.screen);
     GUI.window = XCreateWindow (GUI.display, RootWindowOfScreen (GUI.screen),
 				(WidthOfScreen(GUI.screen) - GUI.window_width) / 2,
@@ -873,7 +888,7 @@
 	memset (GUI.delta_screen, 0xff, GFX.Pitch * h);
     if (GUI.interpolated_screen)
 	ZeroMemory (GUI.interpolated_screen, 512 * 478 * 2);
-    if (GUI.interpolate)
+    if (GUI.interpolate && GUI.interpolate != 6)
     {
 	// Offset the rendering of the SNES image by at least one pixel because
 	// Kreed's interpolation routines read one pixel beyond the bounds of
@@ -1046,7 +1061,7 @@
 
     if (GUI.interpolate)
     {
-	if (snes_width == 512 && snes_height > 240 && GUI.interpolate != 5)
+	if (snes_width == 512 && snes_height > 240 && GUI.interpolate != 5 && GUI.interpolate != 6)
 	{
 	    GUI.output_screen = (uint8*)GFX.Screen;
 	    GUI.output_pitch = GFX.Pitch;
@@ -1079,7 +1094,7 @@
 	}
 	else
 	{
-	    if (GUI.interpolate != 5)
+	    if (GUI.interpolate != 5 && GUI.interpolate != 6)
 	    {
 		width = 512;
 		if (snes_height < 240)
@@ -1093,7 +1108,7 @@
 		width = GUI.window_width;
 		cheight = height = GUI.window_height;
 	    }
-	    if (GUI.image_needs_scaling || GUI.interpolate == 5)
+	    if (GUI.image_needs_scaling || GUI.interpolate == 5 || GUI.interpolate == 6)
 	    {
 		GUI.box.x = 0;
 		GUI.box.y = 0;
@@ -1105,8 +1120,9 @@
 	    }
 
 	    // Kreed's bi-linear image filter scales as well
-	    if ((GUI.image_needs_scaling && GUI.interpolate != 5) ||
-		(GUI.depth != 15 && GUI.depth != 16))
+	    if (GUI.interpolate != 6 &&
+		((GUI.image_needs_scaling && GUI.interpolate != 5) ||
+		 (GUI.depth != 15 && GUI.depth != 16)))
 	    {
 		GUI.output_screen = GUI.interpolated_screen;
 		GUI.output_pitch = 512 * 2;
@@ -1166,8 +1182,9 @@
 	}
     }
 
-    if ((GUI.depth != 15 && GUI.depth != 16) ||
-	(GUI.image_needs_scaling && GUI.interpolate != 5))
+    if (GUI.interpolate != 6 &&
+	((GUI.depth != 15 && GUI.depth != 16) ||
+	 (GUI.image_needs_scaling && GUI.interpolate != 5)))
     {
 	done = TRUE;
 	switch (GUI.depth)
@@ -2041,6 +2058,7 @@
     S9xMessage(S9X_INFO, S9X_USAGE, "-y3                             Enable Kreed's Super Eagle image processing");
     S9xMessage(S9X_INFO, S9X_USAGE, "-y4                             Enable Kreed's 2xSaI image processing");
     S9xMessage(S9X_INFO, S9X_USAGE, "-y5                             Enable Kreed's software bi-linear filtering");
+    S9xMessage(S9X_INFO, S9X_USAGE, "-y6                             Enable 3x scanline scaling");
     S9xMessage(S9X_INFO, S9X_USAGE, "-GUI.interpolate<num>           Same as -y<num>");
     S9xMessage(S9X_INFO, S9X_USAGE, "-scale or -sc                   Scale image to fit window");
 #ifdef USE_DGA_EXTENSION
@@ -2076,6 +2094,7 @@
 	case '3':   GUI.interpolate = 3;	break;
 	case '4':   GUI.interpolate = 4;	break;
 	case '5':   GUI.interpolate = 5;	break;
+	case '6':   GUI.interpolate = 6;	break;
 	}
     }
     else
@@ -2091,6 +2110,7 @@
 	case '3':   GUI.interpolate = 3;	break;
 	case '4':   GUI.interpolate = 4;	break;
 	case '5':   GUI.interpolate = 5;	break;
+	case '6':   GUI.interpolate = 6;	break;
 	}
     }
     else
@@ -2121,9 +2142,474 @@
     S9xSetInfoString (buffer);
 }
 
+/* Relative height of a minimum-brightness scanline (absolute height
+ * divided by distance between adjacent scanlines). */
+#define SCANLINE_HEIGHT  0.6f
+/* Relative distance over which the edges of a scanline fade to zero (in
+ * the same units as SCANLINE_HEIGHT and SCANLINE_BLEED). */
+#define SCANLINE_FADE    0.1f
+/* Additional amount by which a maximum-brightness scanline "bleeds" (the
+ * difference between the relative height of a maximum-brightness scanline
+ * and SCANLINE_HEIGHT).  Optional: leave undefined to disable (faster). */
+#define SCANLINE_BLEED   0.2f
+
+/* Make sure we use 64-bit registers for pointers on x86-64. */
+#ifdef __x86_64__
+# define RAX "%%rax"
+# define RBX "%%rbx"
+# define RCX "%%rcx"
+# define RDX "%%rdx"
+# define RSI "%%rsi"
+# define RDI "%%rdi"
+#else
+# define RAX "%%eax"
+# define RBX "%%ebx"
+# define RCX "%%ecx"
+# define RDX "%%edx"
+# define RSI "%%esi"
+# define RDI "%%edi"
+#endif
+
+/* Integrate the brightness weight of one source scanline over the output
+ * span [srcy0, srcy1], given in source-line units; positions are taken
+ * relative to floor(srcy0).  The weight is 1 between y2 and y3 and a
+ * unit-slope linear ramp across the fade zones [y1,y2] and [y3,y4]; Y
+ * scales the SCANLINE_BLEED widening of the line, when that is defined.
+ * Returns the integrated weight, or 0 for an empty span. */
+static inline float calc_yscale(float srcy0, float srcy1, float Y)
+{
+    if (srcy1 <= srcy0)
+	return 0;
+
+#ifdef SCANLINE_BLEED
+    const float height = SCANLINE_HEIGHT + SCANLINE_BLEED*Y;
+#else
+    const float height = SCANLINE_HEIGHT;
+#endif
+    const float y1 = 0.5f - height/2 - SCANLINE_FADE;
+    const float y2 = 0.5f - height/2;
+    const float y3 = 0.5f + height/2;
+    const float y4 = 0.5f + height/2 + SCANLINE_FADE;
+          float ytmp = srcy0 - floor(srcy0);
+    const float ymax = srcy1 - floor(srcy0);  // not a typo!
+          float ytotal = 0.0f;
+
+    if (ytmp < y1) {
+	/* Below the fade-in zone: test the relative ymax, not srcy1 */
+	if (ymax <= y1)
+	    return ytotal;
+	ytmp = y1;
+    }
+    if (ytmp < y2) {
+	/* Just a simple linear falloff for now */
+	if (ymax <= y2) {
+	    ytotal += 0.5f*(ymax-y1)*(ymax-y1) - 0.5f*(ytmp-y1)*(ytmp-y1);
+	    return ytotal;
+	}
+	ytotal += 0.5f*(y2-y1)*(y2-y1) - 0.5f*(ytmp-y1)*(ytmp-y1);
+	ytmp = y2;
+    }
+    if (ytmp < y3) {
+	if (ymax <= y3) {
+	    ytotal += ymax - ytmp;
+	    return ytotal;
+	}
+	ytotal += y3 - ytmp;
+	ytmp = y3;
+    }
+    if (ytmp < y4) {
+	/* Again, linear falloff for now */
+	if (ymax <= y4) {
+	    ytotal += 0.5f*(y4-ytmp)*(y4-ytmp) - 0.5f*(y4-ymax)*(y4-ymax);
+	} else {
+	    ytotal += 0.5f*(y4-ytmp)*(y4-ytmp);
+	}
+    }
+    return ytotal;
+}
+
+/* Scale the 16bpp source image (width x height, rows srcPitch bytes apart)
+ * to dstWidth x dstHeight pixels at dstPtr, blending adjacent source
+ * columns and weighting each output row with calc_yscale() to draw
+ * scanlines.  Per-column source indices and blend weights are cached in
+ * static tables and rebuilt whenever the dimensions change.  The assembly
+ * path handles 8 output pixels per iteration, 4 bytes each, and reads
+ * source pixel x_source[x]+1.  Exits on out-of-memory or unsupported bpp. */
+static void Scale_3x (uint8 *srcPtr, uint32 srcPitch, uint8 * /* deltaPtr */,
+		      uint8 *dstPtr, uint32 dstPitch, int width, int height,
+		      int dstWidth, int dstHeight, int bpp)
+{
+    /* Parameters used for precalculated data */
+    static int inw = 0, inh = 0, outw = 0, outh = 0;
+    /* The precalculated data itself */
+    static uint16_t *x_source = NULL;   // Source for each X pixel
+    static float *x_weight1 = NULL;     // Weight for left (base) pixel
+    static float *x_weight2 = NULL;     // Weight for right (adjacent) pixel
+    static uint16_t *x_wgt16_1 = NULL;  // As x_weight1, in 1.15 fixed point
+    static uint16_t *x_wgt16_2 = NULL;  // As x_weight2, in 1.15 fixed point
+
+    if (inw != width || inh != height
+     || outw != dstWidth || outh != dstHeight
+    ) {
+	/* Parameters changed (or first call), precalculate data */
+	free(x_source);
+	free(x_weight1);
+	free(x_weight2);
+	free(x_wgt16_1);
+	free(x_wgt16_2);
+	x_source = (uint16_t *)malloc(dstWidth*2);
+	x_weight1 = (float *)malloc(dstWidth*4);
+	x_weight2 = (float *)malloc(dstWidth*4);
+	x_wgt16_1 = (uint16_t *)malloc(dstWidth*2);
+	x_wgt16_2 = (uint16_t *)malloc(dstWidth*2);
+	if (!x_source || !x_weight1 || !x_weight2 || !x_wgt16_1 || !x_wgt16_2){
+	    fprintf(stderr, "Sorry, out of memory\n");
+	    exit(1);
+	}
+	int x;
+	for (x = 0; x < dstWidth; x++) {
+	    const float srcx0 = x * width / (float)dstWidth;
+	    const float srcx1 = (x+1) * width / (float)dstWidth;
+	    const int floor0 = (int)floor(srcx0);
+	    const int floor1 = (int)floor(srcx1);
+	    const float xfrac = (floor0==floor1 ? 1.0f
+	                         : (floor1-srcx0) / (srcx1-srcx0));
+	    x_source [x] = floor0;
+	    x_weight1[x] = xfrac;
+	    x_weight2[x] = 1.0 - xfrac;
+	    x_wgt16_1[x] = (int)(xfrac * 32768);
+	    x_wgt16_2[x] = 32768 - x_wgt16_1[x];
+	}
+	inw = width;     inh = height;
+	outw = dstWidth; outh = dstHeight;
+    }
+
+    const int Bpp = bpp/8;
+#ifdef SLOW_CLEVERNESS  // not defined
+    const int Rofs = (GUI.red_shift==0 ? 0 : 2);
+    const int Gofs = 1;
+    const int Bofs = (GUI.blue_shift==0 ? 0 : 2);
+#endif
+    int y;
+
+    if (bpp != 24 && bpp != 32) {
+	fprintf(stderr, "Sorry, non-24/32bpp broken\n");
+	exit(1);
+    }
+    for (y = 0; y < dstHeight; y++) {
+	const float srcy0 = y * height / (float)dstHeight;
+	const float srcy1 = (y+1) * height / (float)dstHeight;
+	const uint16_t *in0 =
+	    (uint16_t *)(srcPtr + ((int)floor(srcy0)*srcPitch));
+#ifndef SLOW_CLEVERNESS
+	const uint16_t *in1 =
+	    (uint16_t *)(srcPtr + ((int)floor(srcy1)*srcPitch));
+#endif
+	float yscale0, ybleed0, yscale1, ybleed1;
+	if (floor(srcy0) == floor(srcy1)) {
+	    yscale0 = calc_yscale(srcy0, srcy1, 0) / (srcy1-srcy0);
+	    ybleed0 = calc_yscale(srcy0, srcy1, 1) / (srcy1-srcy0) - yscale0;
+	    yscale1 = 0;
+	    ybleed1 = 0;
+	} else {
+	    yscale0 = calc_yscale(srcy0, floor(srcy1), 0) / (srcy1-srcy0);
+	    ybleed0 = calc_yscale(srcy0, floor(srcy1), 1) / (srcy1-srcy0)
+		    - yscale0;
+	    yscale1 = calc_yscale(floor(srcy1), srcy1, 0) / (srcy1-srcy0);
+	    ybleed1 = calc_yscale(floor(srcy1), srcy1, 1) / (srcy1-srcy0)
+	            - yscale1;
+	}
+	int x;
+#ifdef SLOW_CLEVERNESS  // not defined
+	for (x = 0; x < dstWidth; x++) {
+	    const int srcx = x_source[x];
+	    const float x0frac = x_weight1[x];
+	    const float x1frac = x_weight2[x];
+	    const float R = ((in0[srcx  ]>>11 & 0x1F) * x0frac)
+	                  + ((in0[srcx+1]>>11 & 0x1F) * x1frac);
+	    const float G = ((in0[srcx  ]>> 6 & 0x1F) * x0frac)
+	                  + ((in0[srcx+1]>> 6 & 0x1F) * x1frac);
+	    const float B = ((in0[srcx  ]>> 0 & 0x1F) * x0frac)
+	                  + ((in0[srcx+1]>> 0 & 0x1F) * x1frac);
+	    const float Y = 0.299f*R + 0.587f*G + 0.114f*B;
+	    const float yscale = calc_yscale(srcy0, srcy1, Y/31.0f) / (srcy1-srcy0) * (255.0f/31.0f);
+	    dstPtr[y*dstPitch+x*Bpp+Rofs] = R * yscale;
+	    dstPtr[y*dstPitch+x*Bpp+Gofs] = G * yscale;
+	    dstPtr[y*dstPitch+x*Bpp+Bofs] = B * yscale;
+	}
+#else  // speed along!
+	/* Vectorized assembly; output width must be a multiple of 8, 32bpp
+	 * BGRx only */
+#ifdef SCANLINE_BLEED
+	__attribute__((aligned(16))) static const struct {uint16_t i[24];} cdata = {{
+	    /* 0x00: RGB -> Y (unsigned) */
+	    0.299/255*16777216, 0.299/255*16777216,
+	    0.299/255*16777216, 0.299/255*16777216,
+	    0.299/255*16777216, 0.299/255*16777216,
+	    0.299/255*16777216, 0.299/255*16777216,
+	    0.587/255*16777216, 0.587/255*16777216,
+	    0.587/255*16777216, 0.587/255*16777216,
+	    0.587/255*16777216, 0.587/255*16777216,
+	    0.587/255*16777216, 0.587/255*16777216,
+	    0.114/255*16777216, 0.114/255*16777216,
+	    0.114/255*16777216, 0.114/255*16777216,
+	    0.114/255*16777216, 0.114/255*16777216,
+	    0.114/255*16777216, 0.114/255*16777216,
+        }};
+#endif
+	__attribute__((aligned(16))) struct {uint16_t i[144];} vdata;
+	    /* 0x000: in0: R0, G0, B0, R1, G1, B1 */
+	    /* 0x060: xNfrac0, xNfrac1 */
+	    /* 0x080: yscale0, ybleed0 */
+	    /* 0x0A0: in1: R0, G0, B0, R1, G1, B1 */
+	    /* 0x100: yscale1, ybleed1 */
+	memset(&vdata, 0x80, sizeof(vdata));
+	vdata.i[64] = vdata.i[65] = vdata.i[66] = vdata.i[67] = 
+	vdata.i[68] = vdata.i[69] = vdata.i[70] = vdata.i[71] = yscale0*512;
+	vdata.i[72] = vdata.i[73] = vdata.i[74] = vdata.i[75] = 
+	vdata.i[76] = vdata.i[77] = vdata.i[78] = vdata.i[79] = ybleed0*1024;
+	vdata.i[128]= vdata.i[129]= vdata.i[130]= vdata.i[131]= 
+	vdata.i[132]= vdata.i[133]= vdata.i[134]= vdata.i[135]= yscale1*512;
+	vdata.i[136]= vdata.i[137]= vdata.i[138]= vdata.i[139]= 
+	vdata.i[140]= vdata.i[141]= vdata.i[142]= vdata.i[143]= ybleed1*1024;
+	for (x = 0; x < dstWidth; x += 8) {
+	    int i;
+#if 0  // SSE2 routine, 2500us @ 896x672 -- but not fixed to handle 16bpp input
+	    asm("movq ("RSI",%1,4), %%xmm0"
+		: : "S" (in0), "r" ((long)x_source[x  ]));
+	    asm("movq ("RSI",%1,4), %%xmm1"
+		: : "S" (in0), "r" ((long)x_source[x+1]));
+	    asm("movq ("RSI",%1,4), %%xmm2"
+		: : "S" (in0), "r" ((long)x_source[x+2]));
+	    asm("movq ("RSI",%1,4), %%xmm3"
+		: : "S" (in0), "r" ((long)x_source[x+3]));
+	    asm("movq ("RSI",%1,4), %%xmm4"
+		: : "S" (in0), "r" ((long)x_source[x+4]));
+	    asm("movq ("RSI",%1,4), %%xmm5"
+		: : "S" (in0), "r" ((long)x_source[x+5]));
+	    asm("movq ("RSI",%1,4), %%xmm6"
+		: : "S" (in0), "r" ((long)x_source[x+6]));
+	    asm("movq ("RSI",%1,4), %%xmm7"
+		: : "S" (in0), "r" ((long)x_source[x+7]));
+	    asm("punpcklbw %%xmm1, %%xmm0	# XMM0: RGBb,a RGB1,0	\n\
+                 punpcklbw %%xmm3, %%xmm2	# XMM2: RGBd,c RGB3,2	\n\
+                 punpcklbw %%xmm5, %%xmm4	# XMM4: RGBf,e RGB5,4	\n\
+                 punpcklbw %%xmm7, %%xmm6	# XMM6: RGBh,g RGB7,6	\n\
+                 movdqa %%xmm0, %%xmm1		# XMM1: RGBb,a RGB1,0	\n\
+                 movdqa %%xmm4, %%xmm5		# XMM5: RGBf,e RGB5,4	\n\
+                 punpcklwd %%xmm2, %%xmm0	# XMM0: R3-R0G3-G0B3-B0	\n\
+                 punpckhwd %%xmm2, %%xmm1	# XMM1: Rd-RaGd-GaBd-Ba	\n\
+                 punpcklwd %%xmm6, %%xmm4	# XMM4: R7-R4G7-G4B7-B4	\n\
+                 punpckhwd %%xmm6, %%xmm5	# XMM5: Rh-ReGh-GeBh-Be	\n\
+                 movdqa %%xmm0, %%xmm2		# XMM1: R3-R0G3-G0B3-B0	\n\
+                 punpckldq %%xmm4, %%xmm0	# XMM0: G7-G0B7-B0	\n\
+                 punpckhdq %%xmm4, %%xmm2	# XMM1: R7-R0		\n\
+                 movdqa ("RDI"), %%xmm3		# XMM3: 0x80 x 16	\n\
+                 movdqa %%xmm3, %%xmm6		# XMM6: 0x80 x 16	\n\
+                 movdqa %%xmm3, %%xmm7		# XMM7: 0x80 x 16	\n\
+                 punpcklbw %%xmm2, %%xmm3	# XMM3: R7 R6 ... R0	\n\
+                 punpckhbw %%xmm0, %%xmm6	# XMM6: G7 G6 ... G0	\n\
+                 punpcklbw %%xmm0, %%xmm7	# XMM7: B7 B6 ... B0	\n\
+                 movdqa %%xmm3, ("RDI")					\n\
+                 movdqa %%xmm6, 0x10("RDI")				\n\
+                 movdqa %%xmm7, 0x20("RDI")				\n\
+                 movdqa %%xmm1, %%xmm2		# XMM2: Rd-RaGd-GaBd-Ba	\n\
+                 punpckldq %%xmm5, %%xmm1	# XMM0: Gh-GaBh-Ba	\n\
+                 punpckhdq %%xmm5, %%xmm2	# XMM1: Rh-Ra		\n\
+                 movdqa 0x40("RDI"), %%xmm3	# XMM3: 0x80 x 16	\n\
+                 movdqa %%xmm3, %%xmm6		# XMM6: 0x80 x 16	\n\
+                 movdqa %%xmm3, %%xmm7		# XMM7: 0x80 x 16	\n\
+                 punpcklbw %%xmm2, %%xmm3	# XMM3: Rh Rg ... Ra	\n\
+                 punpckhbw %%xmm1, %%xmm6	# XMM6: Gh Gg ... Ga	\n\
+                 punpcklbw %%xmm1, %%xmm7	# XMM7: Bh Bg ... Ba	\n\
+                 movdqa %%xmm3, 0x30("RDI")				\n\
+                 movdqa %%xmm6, 0x40("RDI")				\n\
+                 movdqa %%xmm7, 0x50("RDI")				\n"
+		: /* no outputs */
+		: "D" (&vdata), "m" (vdata)
+	    );
+#endif
+	    for (i = 0; i < 8; i++) {
+#if 1		// simple routine, 3300us @ 896x672
+		asm("\
+                    mov ("RSI",%1,2), %%eax	\n\
+                    mov %%eax, %%edx            \n\
+                    and $0xF81FF81F, %%eax	\n\
+                    and $0x07C007C0, %%edx	\n\
+                    mov %%ah,  1("RDI",%3,2)	\n\
+                    ror $16, %%eax		\n\
+                    mov %%ah, 49("RDI",%3,2)	\n\
+                    shr $3, %%edx		\n\
+                    shl $3, %%eax		\n\
+                    mov %%dl, 17("RDI",%3,2)	\n\
+                    mov %%al, 81("RDI",%3,2)	\n\
+                    shr $16, %%edx		\n\
+                    shr $16, %%eax		\n\
+                    mov %%dl, 65("RDI",%3,2)	\n\
+                    mov %%al, 33("RDI",%3,2)	\n"
+		    : /* no outputs */
+		    : "S" (in0), "r" ((long)x_source[x+i]), "D" (&vdata),
+		      "c" ((long)i), "m" (vdata)  // %ah needs a non-REX index
+		    : "eax", "edx", "memory"
+		);
+#endif
+		vdata.i[i+48] = x_wgt16_1[x+i];
+		vdata.i[i+56] = x_wgt16_2[x+i];
+		if (vdata.i[128] /*yscale1*/ || vdata.i[136] /*ybleed1*/) {
+		    /* Second source line: same RGB565 unpacking, 0xA0 further in */
+		    asm("\
+                        mov ("RSI",%1,2), %%eax		\n\
+                        mov %%eax, %%edx		\n\
+                        and $0xF81FF81F, %%eax		\n\
+                        and $0x07C007C0, %%edx		\n\
+                        mov %%ah, 161("RDI",%3,2)	\n\
+                        ror $16, %%eax			\n\
+                        mov %%ah, 209("RDI",%3,2)	\n\
+                        shr $3, %%edx			\n\
+                        shl $3, %%eax			\n\
+                        mov %%dl, 177("RDI",%3,2)	\n\
+                        mov %%al, 241("RDI",%3,2)	\n\
+                        shr $16, %%edx			\n\
+                        shr $16, %%eax			\n\
+                        mov %%dl, 225("RDI",%3,2)	\n\
+                        mov %%al, 193("RDI",%3,2)	\n"
+			: /* no outputs */
+			: "S" (in1), "r" ((long)x_source[x+i]), "D" (&vdata),
+			  "c" ((long)i), "m" (vdata)  // %ah needs a non-REX index
+			: "eax", "edx", "memory"
+		    );
+		}
+	    }
+	    // 1850us @ 896x672
+	    asm("\
+                # Load input RGB data, and merge RGB values as dictated	\n\
+                # by X weights						\n\
+                movdqa 0x60("RSI"), %%xmm6	# XMM6: Xn0 <<15	\n\
+                movdqa 0x70("RSI"), %%xmm7	# XMM7: Xn1 <<15	\n\
+                movdqa 0x00("RSI"), %%xmm0	# XMM0: Rn0 <<8		\n\
+                pmulhuw %%xmm6, %%xmm0		# XMM0: Rn0 scaled <<7	\n\
+                movdqa 0x10("RSI"), %%xmm1	# XMM1: Gn0 <<8		\n\
+                pmulhuw %%xmm6, %%xmm1		# XMM1: Gn0 scaled <<7	\n\
+                movdqa 0x20("RSI"), %%xmm2	# XMM2: Bn0 <<8		\n\
+                pmulhuw %%xmm6, %%xmm2		# XMM2: Bn0 scaled <<7	\n\
+                movdqa 0x30("RSI"), %%xmm3	# XMM3: Rn1 <<8		\n\
+                pmulhuw %%xmm7, %%xmm3		# XMM3: Rn1 scaled <<7	\n\
+                movdqa 0x40("RSI"), %%xmm4	# XMM4: Gn1 <<8		\n\
+                pmulhuw %%xmm7, %%xmm4		# XMM4: Gn1 scaled <<7	\n\
+                movdqa 0x50("RSI"), %%xmm5	# XMM5: Bn1 <<8		\n\
+                pmulhuw %%xmm7, %%xmm5		# XMM5: Bn1 scaled <<7	\n\
+                paddw %%xmm3, %%xmm0		# XMM0: Rn <<7		\n\
+                paddw %%xmm4, %%xmm1		# XMM1: Gn <<7		\n\
+                paddw %%xmm5, %%xmm2		# XMM2: Bn <<7		\n\
+                # Load Y factor						\n\
+                movdqa 0x80("RSI"), %%xmm7	# XMM7: yscale0 <<9	\n"
+#ifdef SCANLINE_BLEED
+"               # See if we need to do a bleed check			\n\
+                mov 0x90("RSI"), %%eax					\n\
+                test %%eax, %%eax					\n\
+                jz 0f							\n\
+                # Convert RGB to luma for bleeding			\n\
+                movdqa %%xmm0, %%xmm3		# XMM3: Rn <<7		\n\
+                pmulhuw ("RBX"), %%xmm3		# XMM3: YRn <<15	\n\
+                movdqa %%xmm1, %%xmm4		# XMM4: Gn <<7		\n\
+                pmulhuw 0x10("RBX"), %%xmm4	# XMM4: YGn <<15	\n\
+                movdqa %%xmm2, %%xmm5		# XMM5: Bn <<7		\n\
+                pmulhuw 0x20("RBX"), %%xmm5	# XMM5: YBn <<15	\n\
+                paddw %%xmm4, %%xmm3					\n\
+                paddw %%xmm5, %%xmm3		# XMM3: Y <<15		\n\
+                # Calculate bleed factor and add to Y factor		\n\
+                pmulhuw 0x90("RSI"), %%xmm3	# XMM3: bleed <<9	\n\
+                paddw %%xmm3, %%xmm7		# XMM7: new yscale0 <<9	\n"
+#endif
+"0:             # Scale by Y factor					\n\
+                pmulhuw %%xmm7, %%xmm0		# XMM0: new Rn		\n\
+                pmulhuw %%xmm7, %%xmm1		# XMM1: new Gn		\n\
+                pmulhuw %%xmm7, %%xmm2		# XMM2: new Bn		\n\
+                # See if we have a second scanline to process		\n\
+                mov 0x100("RSI"), %%eax		# yscale1 ...		\n\
+                or 0x110("RSI"), %%eax		# ... or ybleed1 set?	\n\
+                jz 1f							\n\
+                # We do; first save the current RGB (overwrite the old	\n\
+                # source pixels, as we don't need them anymore)		\n\
+                movdqa %%xmm0, ("RSI")					\n\
+                movdqa %%xmm1, 0x10("RSI")				\n\
+                movdqa %%xmm2, 0x20("RSI")				\n\
+                # Process the second scanline just as above		\n\
+                movdqa 0x60("RSI"), %%xmm6	# XMM6: Xn0 <<15	\n\
+                movdqa 0x70("RSI"), %%xmm7	# XMM7: Xn1 <<15	\n\
+                movdqa 0xA0("RSI"), %%xmm0	# XMM0: Rn0 <<8		\n\
+                pmulhuw %%xmm6, %%xmm0		# XMM0: Rn0 scaled <<7	\n\
+                movdqa 0xB0("RSI"), %%xmm1	# XMM1: Gn0 <<8		\n\
+                pmulhuw %%xmm6, %%xmm1		# XMM1: Gn0 scaled <<7	\n\
+                movdqa 0xC0("RSI"), %%xmm2	# XMM2: Bn0 <<8		\n\
+                pmulhuw %%xmm6, %%xmm2		# XMM2: Bn0 scaled <<7	\n\
+                movdqa 0xD0("RSI"), %%xmm3	# XMM3: Rn1 <<8		\n\
+                pmulhuw %%xmm7, %%xmm3		# XMM3: Rn1 scaled <<7	\n\
+                movdqa 0xE0("RSI"), %%xmm4	# XMM4: Gn1 <<8		\n\
+                pmulhuw %%xmm7, %%xmm4		# XMM4: Gn1 scaled <<7	\n\
+                movdqa 0xF0("RSI"), %%xmm5	# XMM5: Bn1 <<8		\n\
+                pmulhuw %%xmm7, %%xmm5		# XMM5: Bn1 scaled <<7	\n\
+                paddw %%xmm3, %%xmm0		# XMM0: Rn <<7		\n\
+                paddw %%xmm4, %%xmm1		# XMM1: Gn <<7		\n\
+                paddw %%xmm5, %%xmm2		# XMM2: Bn <<7		\n\
+                movdqa 0x100("RSI"), %%xmm7	# XMM7: yscale1 <<9	\n"
+#ifdef SCANLINE_BLEED
+"               mov 0x110("RSI"), %%eax					\n\
+                test %%eax, %%eax					\n\
+                jz 0f							\n\
+                movdqa %%xmm0, %%xmm3		# XMM3: Rn <<7		\n\
+                pmulhuw ("RBX"), %%xmm3		# XMM3: YRn <<15	\n\
+                movdqa %%xmm1, %%xmm4		# XMM4: Gn <<7		\n\
+                pmulhuw 0x10("RBX"), %%xmm4	# XMM4: YGn <<15	\n\
+                movdqa %%xmm2, %%xmm5		# XMM5: Bn <<7		\n\
+                pmulhuw 0x20("RBX"), %%xmm5	# XMM5: YBn <<15	\n\
+                paddw %%xmm4, %%xmm3					\n\
+                paddw %%xmm5, %%xmm3		# XMM3: Y <<15		\n\
+                pmulhuw 0x110("RSI"), %%xmm3	# XMM3: bleed <<9	\n\
+                paddw %%xmm3, %%xmm7		# XMM7: new yscale1 <<9	\n"
+#endif
+"0:             # Scale by Y factor					\n\
+                pmulhuw %%xmm7, %%xmm0		# XMM0: new Rn		\n\
+                pmulhuw %%xmm7, %%xmm1		# XMM1: new Gn		\n\
+                pmulhuw %%xmm7, %%xmm2		# XMM2: new Bn		\n\
+                # Add in the first scanline's values			\n\
+                paddw ("RSI"), %%xmm0					\n\
+                paddw 0x10("RSI"), %%xmm1				\n\
+                paddw 0x20("RSI"), %%xmm2				\n\
+1:              # Interleave and saturate RGB values			\n\
+                pxor %%xmm7, %%xmm7		# XMM7: 0's		\n\
+                movdqa %%xmm0, %%xmm4		# XMM4: Rn		\n\
+                movdqa %%xmm1, %%xmm5		# XMM5: Gn		\n\
+                movdqa %%xmm2, %%xmm6		# XMM6: Bn		\n\
+                punpcklwd %%xmm1, %%xmm2	# Interleave B and G	\n\
+                punpcklwd %%xmm7, %%xmm0	# Interleave R and 0	\n\
+                packuswb %%xmm7, %%xmm2		# Saturate B and G	\n\
+                packuswb %%xmm7, %%xmm0		# Saturate R		\n\
+                punpcklwd %%xmm0, %%xmm2	# Interleave R/G/B	\n\
+                punpckhwd %%xmm5, %%xmm6	# Do it again (high wds)\n\
+                punpckhwd %%xmm7, %%xmm4				\n\
+                packuswb %%xmm7, %%xmm6					\n\
+                packuswb %%xmm7, %%xmm4					\n\
+                punpcklwd %%xmm4, %%xmm6				\n\
+                # Store to destination					\n\
+                movdqu %%xmm2, ("RDI")					\n\
+                movdqu %%xmm6, 16("RDI")				\n"
+                : "+m" (vdata)  /* read here, and may be modified */
+                : "S" (&vdata), "D" (dstPtr + y*dstPitch + x*Bpp)
+#ifdef SCANLINE_BLEED
+                  , "b" (&cdata), "m" (cdata)
+#endif
+		: "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+		  "xmm6", "xmm7", "memory"  /* stores through RDI */
+            );
+	}
+#endif  // SLOW_CLEVERNESS or not
+    }
+}
+
 void TVMode (int width, int height)
 {
-    switch (width != 256 && GUI.interpolate != 5 ? 1 : GUI.interpolate)
+    switch (width != 256 && GUI.interpolate != 5 && GUI.interpolate != 6 ? 1 : GUI.interpolate)
     {
     case 2:
 	Super2xSaI ((uint8*)GFX.Screen, GFX.Pitch, GUI.delta_screen, GUI.output_screen,
@@ -2163,6 +2648,11 @@
 	Scale_2xSaI ((uint8*)GFX.Screen, GFX.Pitch, GUI.delta_screen, GUI.output_screen,
 		     GUI.output_pitch, GUI.window_width, GUI.window_height, width, height);
 	break;
+    case 6:
+	Scale_3x ((uint8*)GFX.Screen, GFX.Pitch, GUI.delta_screen, GUI.output_screen,
+		  GUI.output_pitch, width, height,
+		  GUI.window_width, height>240 ? height*3/2 : height*3, GUI.image->bits_per_pixel);
+	break;
     case 1:
     {
 	uint8 *nextLine, *srcPtr, *deltaPtr, *finish;