cSDL/src/gfx/SDL_imageFilter.c

/*

SDL_imageFilter.c: byte-image "filter" routines

Copyright (C) 2001-2012  Andreas Schiffler
Copyright (C) 2013  Sylvain Beucler

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

   1. The origin of this software must not be misrepresented; you must not
   claim that you wrote the original software. If you use this software
   in a product, an acknowledgment in the product documentation would be
   appreciated but is not required.

   2. Altered source versions must be plainly marked as such, and must not be
   misrepresented as being the original software.

   3. This notice may not be removed or altered from any source
   distribution.

Andreas Schiffler -- aschiffler at ferzkopp dot net

*/

/*

Note: Uses inline x86 MMX or ASM optimizations if available and enabled.

Note: Most of the MMX code is based on published routines
by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to
him for his work.

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Use GCC intrinsics if available: they support both i386 and x86_64,
   provide ASM-grade performance, and avoid the PUSHA/POPA issues. */
#ifdef __GNUC__
#  ifdef USE_MMX
#    include <mmintrin.h>
#  endif
#endif
#include <SDL_cpuinfo.h>
#include "SDL_imageFilter.h"

/*!
\brief Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.).

Note: the argument is expanded four times, so it must not have side effects.
NOTE(review): presumably meant for unsigned 32-bit operands -- with a signed
or wider operand the shifts do not yield a clean byte swap; confirm at call sites.
*/
#define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8)  | (((x) & 0x0000ff00) << 8)  | ((x) << 24))

/* ------ Static variables ----- */

/*!
\brief Static state which enables the use of the MMX routines. Enabled by default

Set to 0 by SDL_imageFilterMMXoff(), set to 1 by SDL_imageFilterMMXon() and
read by SDL_imageFilterMMXdetect().
*/
static int SDL_imageFilterUseMMX = 1;

/* Detect GCC: GCC__ selects the intrinsics / GNU-asm code paths in the
   routines below instead of the __asm { } blocks. */
#if defined(__GNUC__)
#define GCC__
#endif

/*!
\brief MMX detection routine (with override flag).

The override flag is consulted first; SDL_HasMMX() is only called while the
override is enabled.

\returns 0 when the override flag is off, otherwise the value reported by SDL_HasMMX().
*/
int SDL_imageFilterMMXdetect(void)
{
	/* Override switched off => report "no MMX" without probing the CPU */
	return (SDL_imageFilterUseMMX != 0) ? SDL_HasMMX() : 0;
}

/*!
\brief Disable the MMX check for filter functions and force the use of the non-MMX, C based code.

Clears the override flag read by SDL_imageFilterMMXdetect().
*/
void SDL_imageFilterMMXoff(void)
{
	SDL_imageFilterUseMMX = 0;
}

/*!
\brief Enable MMX check for filter functions and use MMX code if available.

Sets the override flag read by SDL_imageFilterMMXdetect().
*/
void SDL_imageFilterMMXon(void)
{
	SDL_imageFilterUseMMX = 1;
}

/* ------------------------------------------------------------------------------------ */

/*!
\brief Internal MMX Filter using Add: D = saturation255(S1 + S2)

Only complete groups of 8 bytes (SrcLength/8 of them) are processed; any
remaining bytes are left for the caller. The inline-assembly variant tests its
counter after the first iteration, so SrcLength is expected to be at least 8.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success or -1 when built without USE_MMX (nothing is processed).
*/
static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics variant: works on i386 and x86_64 */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;	/* MMX handles 8 bytes at a time */
	unsigned int k;

	for (k = 0; k < groups; k++) {
		/* add 8 bytes with unsigned saturation */
		out[k] = _m_paddusb(in1[k], in2[k]);
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count -> number of 8 byte groups */
		align 16		/* align the loop entry on 16 bytes */
L1010:
		movq mm1, [eax]		/* fetch 8 bytes of Src1 */
		paddusb mm1, [ebx]	/* saturated add of 8 bytes of Src2 */
		movq [edi], mm1		/* write 8 result bytes */
		add eax, 8		/* step all three pointers */
		add ebx, 8
		add edi, 8
		dec ecx			/* one group done */
		jnz L1010		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}

/*!
\brief Filter using Add: D = saturation255(S1 + S2)

When MMX is detected and at least 8 bytes are supplied, the MMX routine handles
all complete groups of 8 bytes and the C loop finishes the remaining tail.
If the MMX routine reports failure (it returns -1 when the library was built
without USE_MMX), the C loop processes the whole image instead.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i, istart;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Default: the C loop covers the whole image */
	istart = 0;

	/* The MMX result must be checked: on -1 nothing was written to Dest */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterAddMMX(Src1, Src2, Dest, length) == 0)) {
		/* MMX processed every complete group of 8 bytes; only the tail is left */
		istart = length & 0xfffffff8;
	}

	/* C routine to process the remaining bytes (none if length is a multiple of 8) */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + (int) Src2[i];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using Mean: D = S1/2 + S2/2

Each source is shifted right by one bit as 16-bit words and then ANDed with
Mask, before both halves are added with saturation. Only complete groups of
8 bytes are processed; the inline-assembly variant tests its counter after the
first iteration, so SrcLength is expected to be at least 8.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.
\param Mask Mask array containing 8 bytes with 0x7F value.

\return Returns 0 for success or -1 when built without USE_MMX (nothing is processed).
*/
static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
						   unsigned char *Mask)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics variant: works on i386 and x86_64 */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	const __m64 *mask = (const __m64 *) Mask;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;	/* MMX handles 8 bytes at a time */
	unsigned int k;

	for (k = 0; k < groups; k++) {
		/* halve via word shift, then apply Mask to the 8 bytes */
		__m64 half1 = _m_pand(_m_psrlwi(in1[k], 1), *mask);
		__m64 half2 = _m_pand(_m_psrlwi(in2[k], 1), *mask);
		/* add the two halves (8 bytes, with saturation) */
		out[k] = _m_paddusb(half1, half2);
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov edx, Mask		/* edx = Mask address */
		movq mm0, [edx]		/* mm0 = the 8 mask bytes */
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count -> number of 8 byte groups */
		align 16		/* align the loop entry on 16 bytes */
L21011:
		movq mm1, [eax]		/* fetch 8 bytes of Src1 */
		movq mm2, [ebx]		/* fetch 8 bytes of Src2 */
		/* --- Byte shift via Word shift --- */
		psrlw mm1, 1		/* shift the 4 words of mm1 right by one bit */
		psrlw mm2, 1		/* shift the 4 words of mm2 right by one bit */
		pand mm1, mm0		/* apply Mask to the 8 bytes of mm1 */
		pand mm2, mm0		/* apply Mask to the 8 bytes of mm2 */
		paddusb mm1, mm2	/* mm1 = mm1 + mm2 (8 bytes, saturated) */
		movq [edi], mm1		/* write 8 result bytes */
		add eax, 8		/* step all three pointers */
		add ebx, 8
		add edi, 8
		dec ecx			/* one group done */
		jnz L21011		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}

/*!
\brief Filter using Mean: D = S1/2 + S2/2

When MMX is detected and at least 8 bytes are supplied, the MMX routine handles
all complete groups of 8 bytes and the C loop finishes the remaining tail.
If the MMX routine reports failure (it returns -1 when the library was built
without USE_MMX), the C loop processes the whole image instead.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	/* Per-byte mask handed to the MMX routine (applied after its word-wise shift) */
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int i, istart;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Default: the C loop covers the whole image */
	istart = 0;

	/* The MMX result must be checked: on -1 nothing was written to Dest */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask) == 0)) {
		/* MMX processed every complete group of 8 bytes; only the tail is left */
		istart = length & 0xfffffff8;
	}

	/* C routine to process the remaining bytes (none if length is a multiple of 8) */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] / 2 + (int) Src2[i] / 2;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using Sub: D = saturation0(S1 - S2)

Only complete groups of 8 bytes (SrcLength/8 of them) are processed; any
remaining bytes are left for the caller. The inline-assembly variant tests its
counter after the first iteration, so SrcLength is expected to be at least 8.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success or -1 when built without USE_MMX (nothing is processed).
*/
static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics variant: works on i386 and x86_64 */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;	/* MMX handles 8 bytes at a time */
	unsigned int k;

	for (k = 0; k < groups; k++) {
		/* subtract 8 bytes with unsigned saturation */
		out[k] = _m_psubusb(in1[k], in2[k]);
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count -> number of 8 byte groups */
		align 16		/* align the loop entry on 16 bytes */
L1012:
		movq mm1, [eax]		/* fetch 8 bytes of Src1 */
		psubusb mm1, [ebx]	/* saturated subtract of 8 bytes of Src2 */
		movq [edi], mm1		/* write 8 result bytes */
		add eax, 8		/* step all three pointers */
		add ebx, 8
		add edi, 8
		dec ecx			/* one group done */
		jnz L1012		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}

/*!
\brief Filter using Sub: D = saturation0(S1 - S2)

When MMX is detected and at least 8 bytes are supplied, the MMX routine handles
all complete groups of 8 bytes and the C loop finishes the remaining tail.
If the MMX routine reports failure (it returns -1 when the library was built
without USE_MMX), the C loop processes the whole image instead.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i, istart;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Default: the C loop covers the whole image */
	istart = 0;

	/* The MMX result must be checked: on -1 nothing was written to Dest */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterSubMMX(Src1, Src2, Dest, length) == 0)) {
		/* MMX processed every complete group of 8 bytes; only the tail is left */
		istart = length & 0xfffffff8;
	}

	/* C routine to process the remaining bytes (none if length is a multiple of 8) */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] - (int) Src2[i];
		if (result < 0)
			result = 0;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using AbsDiff: D = | S1 - S2 |

Both saturated differences (S1-S2 and S2-S1) are computed and ORed together;
per byte at most one of them is non-zero, which yields the absolute difference.
Only complete groups of 8 bytes are processed; the inline-assembly variant
tests its counter after the first iteration, so SrcLength is expected to be at
least 8.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success or -1 when built without USE_MMX (nothing is processed).
*/
static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics variant: works on i386 and x86_64 */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	__m64 *out = (__m64 *) Dest;
	const unsigned int groups = SrcLength / 8;	/* MMX handles 8 bytes at a time */
	unsigned int k;

	for (k = 0; k < groups; k++) {
		__m64 diff21 = _m_psubusb(in2[k], in1[k]);	/* Src2-Src1 (8 bytes, saturated) */
		__m64 diff12 = _m_psubusb(in1[k], in2[k]);	/* Src1-Src2 (8 bytes, saturated) */
		out[k] = _m_por(diff21, diff12);		/* merge the two one-sided differences */
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count -> number of 8 byte groups */
		align 16		/* align the loop entry on 16 bytes */
L1013:
		movq mm1, [eax]		/* fetch 8 bytes of Src1 */
		movq mm2, [ebx]		/* fetch 8 bytes of Src2 */
		psubusb mm1, [ebx]	/* mm1 = Src1-Src2 (8 bytes, saturated) */
		psubusb mm2, [eax]	/* mm2 = Src2-Src1 (8 bytes, saturated) */
		por mm1, mm2		/* merge the two one-sided differences */
		movq [edi], mm1		/* write 8 result bytes */
		add eax, 8		/* step all three pointers */
		add ebx, 8
		add edi, 8
		dec ecx			/* one group done */
		jnz L1013		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}

/*!
\brief Filter using AbsDiff: D = | S1 - S2 |

When MMX is detected and at least 8 bytes are supplied, the MMX routine handles
all complete groups of 8 bytes and the C loop finishes the remaining tail.
If the MMX routine reports failure (it returns -1 when the library was built
without USE_MMX), the C loop processes the whole image instead.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i, istart;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Default: the C loop covers the whole image */
	istart = 0;

	/* The MMX result must be checked: on -1 nothing was written to Dest */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length) == 0)) {
		/* MMX processed every complete group of 8 bytes; only the tail is left */
		istart = length & 0xfffffff8;
	}

	/* C routine to process the remaining bytes (none if length is a multiple of 8) */
	for (i = istart; i < length; i++) {
		result = abs((int) Src1[i] - (int) Src2[i]);
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using Mult: D = saturation255(S1 * S2)

The bytes are widened to 16-bit words, multiplied keeping the low 16 bits,
the absolute value of each (signed) word is taken, and the words are packed
back to bytes with unsigned saturation. Only complete groups of 8 bytes are
processed; the inline-assembly variant tests its counter after the first
iteration, so SrcLength is expected to be at least 8.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success or -1 when built without USE_MMX (nothing is processed).
*/
static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifndef USE_MMX
	return (-1);
#else
#  if defined(GCC__)
	/* Intrinsics variant: works on i386 and x86_64 */
	const __m64 *in1 = (const __m64 *) Src1;
	const __m64 *in2 = (const __m64 *) Src2;
	__m64 *out = (__m64 *) Dest;
	const __m64 zero = _m_from_int(0);		/* all-zero register used for unpacking */
	const unsigned int groups = SrcLength / 8;	/* MMX handles 8 bytes at a time */
	unsigned int k;

	for (k = 0; k < groups; k++) {
		/* widen the low and high 4 bytes of each source into words */
		__m64 lo1 = _m_punpcklbw(in1[k], zero);
		__m64 hi1 = _m_punpckhbw(in1[k], zero);
		__m64 lo2 = _m_punpcklbw(in2[k], zero);
		__m64 hi2 = _m_punpckhbw(in2[k], zero);
		/* word-wise products (low 16 bits) */
		__m64 prodLo = _m_pmullw(lo1, lo2);
		__m64 prodHi = _m_pmullw(hi1, hi2);
		/* sign masks: all ones for words with the sign bit set, zero otherwise */
		__m64 signLo = _m_psrawi(prodLo, 15);
		__m64 signHi = _m_psrawi(prodHi, 15);
		/* absolute value: one's complement of negative words ... */
		prodLo = _m_pxor(prodLo, signLo);
		prodHi = _m_pxor(prodHi, signHi);
		/* ... plus one for those words only (W-(-1) or W-0) */
		prodLo = _m_psubsw(prodLo, signLo);
		prodHi = _m_psubsw(prodHi, signHi);
		/* narrow the words back into bytes with saturation */
		out[k] = _m_packuswb(prodLo, prodHi);
	}
	_m_empty();	/* leave MMX state */
#  else
	__asm
	{
		pusha
		mov eax, Src1		/* eax = Src1 address */
		mov ebx, Src2		/* ebx = Src2 address */
		mov edi, Dest		/* edi = Dest address */
		mov ecx, SrcLength	/* ecx = byte count */
		shr ecx, 3		/* byte count -> number of 8 byte groups */
		pxor mm0, mm0		/* mm0 = 0, used for unpacking */
		align 16		/* align the loop entry on 16 bytes */
L1014:
		movq mm1, [eax]		/* fetch 8 bytes of Src1 */
		movq mm3, [ebx]		/* fetch 8 bytes of Src2 */
		movq mm2, mm1		/* duplicate Src1 bytes */
		movq mm4, mm3		/* duplicate Src2 bytes */
		punpcklbw mm1, mm0	/* low  bytes of Src1 -> words */
		punpckhbw mm2, mm0	/* high bytes of Src1 -> words */
		punpcklbw mm3, mm0	/* low  bytes of Src2 -> words */
		punpckhbw mm4, mm0	/* high bytes of Src2 -> words */
		pmullw mm1, mm3		/* products of the low  bytes */
		pmullw mm2, mm4		/* products of the high bytes */
		/* Take abs value of the results (signed words) */
		movq mm5, mm1		/* copy low  products */
		movq mm6, mm2		/* copy high products */
		psraw mm5, 15		/* mm5 words = sign mask of low  products */
		psraw mm6, 15		/* mm6 words = sign mask of high products */
		pxor mm1, mm5		/* one's complement of negative words only */
		pxor mm2, mm6		/* one's complement of negative words only */
		psubsw mm1, mm5		/* add 1 to negative words only, W-(-1) or W-0 */
		psubsw mm2, mm6		/* add 1 to negative words only, W-(-1) or W-0 */
		packuswb mm1, mm2	/* words -> bytes with saturation */
		movq [edi], mm1		/* write 8 result bytes */
		add eax, 8		/* step all three pointers */
		add ebx, 8
		add edi, 8
		dec ecx			/* one group done */
		jnz L1014		/* loop while groups remain */
		emms			/* leave MMX state */
		popa
	}
#  endif
	return (0);
#endif
}

/*!
\brief Filter using Mult: D = saturation255(S1 * S2)

When MMX is detected and at least 8 bytes are supplied, the MMX routine handles
all complete groups of 8 bytes and the C loop finishes the remaining tail.
If the MMX routine reports failure (it returns -1 when the library was built
without USE_MMX), the C loop processes the whole image instead.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i, istart;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Default: the C loop covers the whole image */
	istart = 0;

	/* The MMX result must be checked: on -1 nothing was written to Dest */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterMultMMX(Src1, Src2, Dest, length) == 0)) {
		/* MMX processed every complete group of 8 bytes; only the tail is left */
		istart = length & 0xfffffff8;
	}

	/* C routine to process the remaining bytes (none if length is a multiple of 8) */
	for (i = istart; i < length; i++) {
		/* Product of two bytes, clamped to 255 as documented above */
		result = (int) Src1[i] * (int) Src2[i];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal ASM Filter using MultNor: D = S1 * S2

Processes the arrays one byte at a time (all SrcLength bytes, not only groups
of 8) and stores the low 8 bits of each product.

The assembly loops decrement the counter before testing it, so a zero count
would wrap around and run far past the buffers; zero length and NULL pointers
are therefore rejected before the loop is entered.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (NULL pointer, or built without USE_MMX).
*/
int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
	/* Guard the post-tested loops below against bad pointers and an empty range */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (SrcLength == 0)
		return (0);
#if !defined(GCC__)
	__asm
	{
		pusha
			mov edx, Src1   /* load Src1 address into edx */
			mov esi, Src2   /* load Src2 address into esi */
			mov edi, Dest   /* load Dest address into edi */
			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx */
			align 16 	/* 16 byte alignment of the loop entry */
L10141:
		mov al, [edx]   /* load a byte from Src1 */
		mul [esi] 	/* mul with a byte from Src2 */
		mov [edi], al   /* move a byte result to Dest */
			inc edx 	/* increment Src1, Src2, Dest */
			inc esi 		/* pointer registers by one */
			inc edi
			dec ecx	/* decrease loop counter */
			jnz L10141  	/* check loop termination, proceed if required */
			popa
	}
#else
	/* Note: ~5% gain on i386, less efficient than C on x86_64 */
	/* Also depends on whether this function is static (?!) */
	asm volatile (
		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
#  if defined(i386)
		"1:mov  (%%edx), %%al \n\t"      /* load a byte from Src1 */
		"mulb (%%esi)       \n\t"	/* mul with a byte from Src2 */
		"mov %%al, (%%edi)  \n\t"       /* move a byte result to Dest */
		"inc %%edx \n\t"		/* increment Src1, Src2, Dest */
		"inc %%esi \n\t"		/* pointer registers by one */
		"inc %%edi \n\t"
		"dec %%ecx      \n\t"	/* decrease loop counter */
#  elif defined(__x86_64__)
		"1:mov  (%%rdx), %%al \n\t"      /* load a byte from Src1 */
		"mulb (%%rsi)       \n\t"	/* mul with a byte from Src2 */
		"mov %%al, (%%rdi)  \n\t"       /* move a byte result to Dest */
		"inc %%rdx \n\t"		/* increment Src1, Src2, Dest */
		"inc %%rsi \n\t"		/* pointer registers by one */
		"inc %%rdi \n\t"
		"dec %%rcx      \n\t"	/* decrease loop counter */
#  endif
		"jnz 1b         \n\t"	/* check loop termination, proceed if required */
		: "+d" (Src1),		/* load Src1 address into edx */
		  "+S" (Src2),		/* load Src2 address into esi */
		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
		  "+D" (Dest)		/* load Dest address into edi */
		:
		: "memory", "rax"
		);
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using MultNor: D = S1 * S2

No saturation is applied: each result byte is the low 8 bits of the product.

When MMX is detected the ASM routine is tried first. It processes every byte
itself, so on success there is nothing left to do; running the C loop over the
tail again would multiply those bytes a second time when Dest aliases a source.
If the ASM routine reports failure (it returns -1 when the library was built
without USE_MMX), the C loop processes the whole image.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* ASM routine handles the complete range byte by byte */
	if ((SDL_imageFilterMMXdetect()) &&
	    (SDL_imageFilterMultNorASM(Src1, Src2, Dest, length) == 0)) {
		return (0);
	}

	/* C routine to process the whole image */
	for (i = 0; i < length; i++) {
		/* Conversion to unsigned char keeps the low 8 bits of the product */
		Dest[i] = (unsigned char) ((int) Src1[i] * (int) Src2[i]);
	}

	return (0);
}

/*!
\brief Internal MMX Filter using MultDivby2: D = saturation255(S1/2 * S2)

The bytes are widened to 16-bit words, the Src1 words are shifted right by one
bit, multiplied with the Src2 words and packed back to bytes with unsigned
saturation. Only complete groups of 8 bytes are processed; the inline-assembly
variant tests its counter after the first iteration, so SrcLength is expected
to be at least 8.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 for success or -1 when built without USE_MMX (nothing is processed).
*/
static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			mov eax, Src1   	/* load Src1 address into eax */
			mov ebx, Src2   	/* load Src2 address into ebx */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			pxor mm0,  mm0 	/* zero mm0 register */
			align 16          	/* 16 byte alignment of the loop entry */
L1015:
		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
		movq mm3,  [ebx] 	/* load 8 bytes from Src2 into mm3 */
		movq mm2,  mm1 	/* copy mm1 into mm2 */
			movq mm4,  mm3 	/* copy mm3 into mm4  */
			punpcklbw mm1,  mm0 	/* unpack low  bytes of Src1 into words */
			punpckhbw mm2,  mm0 	/* unpack high bytes of Src1 into words */
			punpcklbw mm3,  mm0 	/* unpack low  bytes of Src2 into words */
			punpckhbw mm4,  mm0 	/* unpack high bytes of Src2 into words */
			psrlw mm1,  1 	/* divide mm1 words by 2, Src1 low bytes */
			psrlw mm2,  1 	/* divide mm2 words by 2, Src1 high bytes */
			pmullw mm1,  mm3 	/* mul low  bytes of Src1 and Src2  */
			pmullw mm2,  mm4 	/* mul high bytes of Src1 and Src2 */
			packuswb mm1,  mm2 	/* pack words back into bytes with saturation */
			movq [edi],  mm1 	/* store result in Dest */
			add eax,  8 	/* increase Src1, Src2 and Dest  */
			add ebx,  8 	/* register pointers by 8 */
			add edi,  8
			dec ecx        	/* decrease loop counter */
			jnz L1015       	/* check loop termination, proceed if required */
			emms             	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mSrc2 = (__m64*)Src2;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
	unsigned int i;
	for (i = 0; i < SrcLength/8; i++) {
		/* Only four temporaries are needed here (no abs-value step as in Mult) */
		__m64 mm1, mm2, mm3, mm4;
		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
		mSrc1++;
		mSrc2++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using MultDivby2: D = saturation255(S1/2 * S2)

When MMX is reported available and the buffer holds at least 8 bytes, the
leading multiple-of-8 bytes are handed to the MMX helper. Every byte that the
helper did not process (the tail, or the whole buffer when the helper reports
failure) is computed by the portable C loop.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Skip the MMX-sized prefix only if the helper actually ran: it returns
	 * -1 without touching Dest when MMX support is compiled out, and in that
	 * case the C loop below must cover the whole buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length) == 0)) {
		istart = length & 0xfffffff8;
	}

	/* C routine: whole image, or the (length % 8) trailing bytes after MMX */
	for (i = istart; i < length; i++) {
		result = ((int) Src1[i] / 2) * (int) Src2[i];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

Works on SrcLength/8 groups of eight bytes; a trailing remainder of fewer
than eight bytes is not touched by this routine.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 when built with USE_MMX, otherwise -1 (nothing is processed).
*/
static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1		/* eax = read pointer for S1 */
		mov ebx, Src2		/* ebx = read pointer for S2 */
		mov edi, Dest		/* edi = write pointer for D */
		mov ecx, SrcLength	/* ecx = byte count ... */
		shr ecx, 3		/* ... turned into a count of 8-byte groups */
		pxor mm0, mm0		/* mm0 = 0, used to widen bytes to words */
		align 16		/* align the loop entry on 16 bytes */
L1016:
		movq mm1, [eax]		/* mm1 = 8 bytes of S1 */
		movq mm3, [ebx]		/* mm3 = 8 bytes of S2 */
		movq mm2, mm1		/* mm2 = second copy of S1 */
		movq mm4, mm3		/* mm4 = second copy of S2 */
		punpcklbw mm1, mm0	/* mm1 = low  4 bytes of S1 as words */
		punpckhbw mm2, mm0	/* mm2 = high 4 bytes of S1 as words */
		punpcklbw mm3, mm0	/* mm3 = low  4 bytes of S2 as words */
		punpckhbw mm4, mm0	/* mm4 = high 4 bytes of S2 as words */
		psrlw mm1, 1		/* halve S1 low words */
		psrlw mm2, 1		/* halve S1 high words */
		psrlw mm3, 1		/* halve S2 low words */
		psrlw mm4, 1		/* halve S2 high words */
		pmullw mm1, mm3		/* low words:  S1/2 * S2/2 */
		pmullw mm2, mm4		/* high words: S1/2 * S2/2 */
		packuswb mm1, mm2	/* narrow to bytes with saturation */
		movq [edi], mm1		/* write 8 result bytes */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next group */
		add edi, 8
		dec ecx			/* one group done */
		jnz L1016		/* repeat until the group count hits zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* Intrinsics variant, valid for both i386 and x86_64 */
	const __m64 zero = _m_from_int(0);
	__m64 *in1 = (__m64*)Src1;
	__m64 *in2 = (__m64*)Src2;
	__m64 *out = (__m64*)Dest;
	unsigned int groups = SrcLength / 8;

	while (groups > 0) {
		/* Widen each byte to a 16-bit word, then halve both operands */
		__m64 lo1 = _m_psrlwi(_m_punpcklbw(*in1, zero), 1);
		__m64 hi1 = _m_psrlwi(_m_punpckhbw(*in1, zero), 1);
		__m64 lo2 = _m_psrlwi(_m_punpcklbw(*in2, zero), 1);
		__m64 hi2 = _m_psrlwi(_m_punpckhbw(*in2, zero), 1);
		/* Multiply word-wise and narrow back to bytes with saturation */
		*out = _m_packuswb(_m_pmullw(lo1, lo2), _m_pmullw(hi1, hi2));
		in1++;
		in2++;
		out++;
		groups--;
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)

When MMX is reported available and the buffer holds at least 8 bytes, the
leading multiple-of-8 bytes are handed to the MMX helper. Every byte that the
helper did not process (the tail, or the whole buffer when the helper reports
failure) is computed by the portable C loop.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Skip the MMX-sized prefix only if the helper actually ran: it returns
	 * -1 without touching Dest when MMX support is compiled out, and in that
	 * case the C loop below must cover the whole buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length) == 0)) {
		istart = length & 0xfffffff8;
	}

	/* C routine: whole image, or the (length % 8) trailing bytes after MMX */
	for (i = istart; i < length; i++) {
		result = ((int) Src1[i] / 2) * ((int) Src2[i] / 2);
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using BitAnd: D = S1 & S2

Works on SrcLength/8 groups of eight bytes; a trailing remainder of fewer
than eight bytes is not touched by this routine.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 when built with USE_MMX, otherwise -1 (nothing is processed).
*/
static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1		/* eax = read pointer for S1 */
		mov ebx, Src2		/* ebx = read pointer for S2 */
		mov edi, Dest		/* edi = write pointer for D */
		mov ecx, SrcLength	/* ecx = byte count ... */
		shr ecx, 3		/* ... turned into a count of 8-byte groups */
		align 16		/* align the loop entry on 16 bytes */
L1017:
		movq mm1, [eax]		/* mm1 = 8 bytes of S1 */
		pand mm1, [ebx]		/* mm1 = S1 & S2 */
		movq [edi], mm1		/* write 8 result bytes */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next group */
		add edi, 8
		dec ecx			/* one group done */
		jnz L1017		/* repeat until the group count hits zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* Intrinsics variant, valid for both i386 and x86_64 */
	__m64 *in1 = (__m64*)Src1;
	__m64 *in2 = (__m64*)Src2;
	__m64 *out = (__m64*)Dest;
	unsigned int groups = SrcLength / 8;

	while (groups > 0) {
		*out = _m_pand(*in1, *in2);		/* S1 & S2, eight bytes at once */
		in1++;
		in2++;
		out++;
		groups--;
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using BitAnd: D = S1 & S2

When MMX is reported available and the buffer holds at least 8 bytes, the
leading multiple-of-8 bytes are handed to the MMX helper. Every byte that the
helper did not process (the tail, or the whole buffer when the helper reports
failure) is computed by the portable C loop.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Skip the MMX-sized prefix only if the helper actually ran: it returns
	 * -1 without touching Dest when MMX support is compiled out, and in that
	 * case the C loop below must cover the whole buffer.
	 */
	if ((SDL_imageFilterMMXdetect() > 0) && (length > 7) &&
	    (SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length) == 0)) {
		istart = length & 0xfffffff8;
	}

	/* C routine: whole image, or the (length % 8) trailing bytes after MMX */
	for (i = istart; i < length; i++) {
		Dest[i] = Src1[i] & Src2[i];
	}

	return (0);
}

/*!
\brief Internal MMX Filter using BitOr: D = S1 | S2

Works on SrcLength/8 groups of eight bytes; a trailing remainder of fewer
than eight bytes is not touched by this routine.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 when built with USE_MMX, otherwise -1 (nothing is processed).
*/
static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		mov eax, Src1		/* eax = read pointer for S1 */
		mov ebx, Src2		/* ebx = read pointer for S2 */
		mov edi, Dest		/* edi = write pointer for D */
		mov ecx, SrcLength	/* ecx = byte count ... */
		shr ecx, 3		/* ... turned into a count of 8-byte groups */
		align 16		/* align the loop entry on 16 bytes */
L91017:
		movq mm1, [eax]		/* mm1 = 8 bytes of S1 */
		por mm1, [ebx]		/* mm1 = S1 | S2 */
		movq [edi], mm1		/* write 8 result bytes */
		add eax, 8		/* step all three pointers */
		add ebx, 8		/* to the next group */
		add edi, 8
		dec ecx			/* one group done */
		jnz L91017		/* repeat until the group count hits zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* Intrinsics variant, valid for both i386 and x86_64 */
	__m64 *in1 = (__m64*)Src1;
	__m64 *in2 = (__m64*)Src2;
	__m64 *out = (__m64*)Dest;
	unsigned int groups = SrcLength / 8;

	while (groups > 0) {
		*out = _m_por(*in1, *in2);		/* S1 | S2, eight bytes at once */
		in1++;
		in2++;
		out++;
		groups--;
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using BitOr: D = S1 | S2

When MMX is reported available and the buffer holds at least 8 bytes, the
leading multiple-of-8 bytes are handed to the MMX helper. Every byte that the
helper did not process (the tail, or the whole buffer when the helper reports
failure) is computed by the portable C loop.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Skip the MMX-sized prefix only if the helper actually ran: it returns
	 * -1 without touching Dest when MMX support is compiled out, and in that
	 * case the C loop below must cover the whole buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length) == 0)) {
		istart = length & 0xfffffff8;
	}

	/* C routine: whole image, or the (length % 8) trailing bytes after MMX */
	for (i = istart; i < length; i++) {
		Dest[i] = Src1[i] | Src2[i];
	}

	return (0);
}

/*!
\brief Internal ASM Filter using Div: D = S1 / S2

Byte-wise unsigned division written in plain x86 assembly; no MMX registers
are used, so every byte of the buffer is processed (there is no 8-byte
granularity). A zero divisor byte does not trap: 255 is stored in Dest
instead. The loop decrements the counter before testing it, so SrcLength
must be non-zero on entry.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.

\return Returns 0 when built with USE_MMX, otherwise -1 (nothing is processed).
*/
static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* Non-GCC build: inline assembly using 32-bit registers */
	__asm
	{
		pusha
			mov edx, Src1   	/* load Src1 address into edx */
			mov esi, Src2   	/* load Src2 address into esi */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			align 16        	/* 16 byte alignment of the loop entry */
L10191:
		mov bl, [esi]   	/* load a byte from Src2 */
		cmp bl, 0   	/* check if it is zero */
			jnz L10192
			mov [edi], 255   	/* division by zero = 255 !!! */
			jmp  L10193
L10192:
		xor ah, ah   	/* prepare AX, zero AH register */
			mov al, [edx]   	/* load a byte from Src1 into AL */
		div   bl             	/* divide AX by BL, quotient goes to AL */
			mov [edi], al   	/* move a byte result to Dest */
L10193:
		inc edx    	/* increment Src1, Src2, Dest */
			inc esi    		/* pointer registers by one */
			inc edi
			dec ecx       	/* decrease loop counter */
			jnz L10191     	/* check loop termination, proceed if required */
			popa
	}
#else
	/* Note: ~15% gain on i386, less efficient than C on x86_64 */
	/* Also depends on whether the function is static (?!) */
	/* Also depends on whether we work on malloc() or static char[] */
	/*
	 * GCC extended asm. The pointers and the counter are read-write ("+")
	 * operands pinned to fixed registers because the loop advances them;
	 * local labels 1/2/3 are the loop head, the non-zero divisor path and
	 * the common pointer-advance tail.
	 */
	asm volatile (
#  if defined(i386)
		"pushl %%ebx \n\t"		/* %ebx may be the PIC register, so it is saved by hand instead of being listed as a clobber */
		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
		"1: mov (%%esi), %%bl  \n\t"	/* load a byte from Src2 */
		"cmp       $0, %%bl    \n\t"	/* check if it is zero */
		"jnz 2f                \n\t"
		"movb  $255, (%%edi)   \n\t"	/* division by zero = 255 !!! */
		"jmp 3f                \n\t"
		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
		"mov   (%%edx), %%al   \n\t"	/* load a byte from Src1 into AL */
		"div   %%bl            \n\t"	/* divide AX by BL, quotient goes to AL */
		"mov   %%al, (%%edi)   \n\t"	/* move a byte result to Dest */
		"3: inc %%edx          \n\t"	/* increment Src1, Src2, Dest */
		"inc %%esi \n\t"		/* pointer registers by one */
		"inc %%edi \n\t"
		"dec %%ecx \n\t"		/* decrease loop counter */
		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
		"popl %%ebx \n\t"		/* restore %ebx */
		: "+d" (Src1),		/* load Src1 address into edx */
		  "+S" (Src2),		/* load Src2 address into esi */
		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
		  "+D" (Dest)		/* load Dest address into edi */
		:
		: "memory", "rax"
#  elif defined(__x86_64__)
		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
		"1: mov (%%rsi), %%bl  \n\t"	/* load a byte from Src2 */
		"cmp       $0, %%bl    \n\t"	/* check if it is zero */
		"jnz 2f                \n\t"
		"movb  $255, (%%rdi)   \n\t"	/* division by zero = 255 !!! */
		"jmp 3f                \n\t"
		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
		"mov   (%%rdx), %%al   \n\t"	/* load a byte from Src1 into AL */
		"div   %%bl            \n\t"	/* divide AX by BL, quotient goes to AL */
		"mov   %%al, (%%rdi)   \n\t"	/* move a byte result to Dest */
		"3: inc %%rdx          \n\t"	/* increment Src1, Src2, Dest */
		"inc %%rsi \n\t"		/* pointer registers by one */
		"inc %%rdi \n\t"
		"dec %%rcx \n\t"		/* decrease loop counter */
		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
		: "+d" (Src1),		/* load Src1 address into rdx */
		  "+S" (Src2),		/* load Src2 address into rsi */
		  "+c" (SrcLength),	/* load loop counter (SIZE) into rcx */
		  "+D" (Dest)		/* load Dest address into rdi */
		:
		: "memory", "rax", "rbx"
#  endif
		);
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using Div: D = S1 / S2

A zero byte in S2 does not cause a division by zero: the corresponding
destination byte is set to 255. When MMX is reported available the assembly
helper processes the whole buffer; if it is unavailable or reports failure,
the portable C loop is used instead.

\param Src1 Pointer to the start of the first source byte array (S1).
\param Src2 Pointer to the start of the second source byte array (S2).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
{
	unsigned int i;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * The ASM routine works byte by byte, so there is never an unaligned
	 * tail. Its result is checked because it returns -1 without touching
	 * Dest when it is compiled out; the C loop then does the work.
	 */
	if (SDL_imageFilterMMXdetect() &&
	    (SDL_imageFilterDivASM(Src1, Src2, Dest, length) == 0)) {
		return (0);
	}

	/* C routine to process the whole image */
	for (i = 0; i < length; i++) {
		if (Src2[i] == 0) {
			Dest[i] = 255;	/* division by zero = 255 */
		} else {
			/* operands are promoted to int; the quotient always fits a byte */
			Dest[i] = (unsigned char) (Src1[i] / Src2[i]);
		}
	}

	return (0);
}

/* ------------------------------------------------------------------------------------ */

/*!
\brief Internal MMX Filter using BitNegation: D = ~S (bitwise complement)

Works on SrcLength/8 groups of eight bytes; a trailing remainder of fewer
than eight bytes is not touched by this routine.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.

\return Returns 0 when built with USE_MMX, otherwise -1 (nothing is processed).
*/
static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16          	/* 16 byte alignment of the loop entry */
L91117:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into mm0 */
		pxor mm0, mm1   	/* negate mm0 by xoring with mm1 */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 and Dest */
			add edi,  8   	/* register pointers by 8 */
			dec ecx        	/* decrease loop counter */
			jnz L91117      	/* check loop termination, proceed if required */
			emms             	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	/*
	 * Give mm1 a defined value before comparing it with itself: reading an
	 * uninitialized object is undefined behavior in C even though the
	 * comparison result (all 1's) does not depend on the contents.
	 */
	__m64 mm1 = _m_from_int(0);
	unsigned int i;
	mm1 = _m_pcmpeqb(mm1, mm1);		/* generate all 1's in mm1 */
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_pxor(*mSrc1, mm1);	/* negate Src1 by xoring with mm1 */
		mSrc1++;
		mDest++;
	}
	_m_empty();				/* clean MMX state */

#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using BitNegation: D = ~S (bitwise complement)

When MMX is reported available and the buffer holds at least 8 bytes, the
leading multiple-of-8 bytes are handed to the MMX helper. Every byte that the
helper did not process (the tail, or the whole buffer when the helper reports
failure) is computed by the portable C loop.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
{
	unsigned int i;
	unsigned int istart = 0;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Skip the MMX-sized prefix only if the helper actually ran: it returns
	 * -1 without touching Dest when MMX support is compiled out, and in that
	 * case the C loop below must cover the whole buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterBitNegationMMX(Src1, Dest, length) == 0)) {
		istart = length & 0xfffffff8;
	}

	/* C routine: whole image, or the (length % 8) trailing bytes after MMX */
	for (i = istart; i < length; i++) {
		Dest[i] = (unsigned char) ~Src1[i];
	}

	return (0);
}

/*!
\brief Internal MMX Filter using AddByte: D = saturation255(S + C)

Works on SrcLength/8 groups of eight bytes; a trailing remainder of fewer
than eight bytes is not touched by this routine.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant value to add (C).

\return Returns 0 when built with USE_MMX, otherwise -1 (nothing is processed).
*/
static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* Replicate C into all 8 bytes of mm1 */
		mov al, C		/* al = C */
		mov ah, al		/* ax = C:C */
		mov bx, ax		/* keep the 16-bit pair in bx */
		shl eax, 16		/* move the pair into the upper half of eax */
		mov ax, bx		/* eax = C:C:C:C */
		movd mm1, eax		/* low dword of mm1 = C x4 */
		movd mm2, eax		/* low dword of mm2 = C x4 */
		punpckldq mm1, mm2	/* mm1 = C x8 */
		mov eax, Src1		/* eax = read pointer for S */
		mov edi, Dest		/* edi = write pointer for D */
		mov ecx, SrcLength	/* ecx = byte count ... */
		shr ecx, 3		/* ... turned into a count of 8-byte groups */
		align 16		/* align the loop entry on 16 bytes */
L1021:
		movq mm0, [eax]		/* mm0 = 8 bytes of S */
		paddusb mm0, mm1	/* mm0 = S + C, saturating per byte */
		movq [edi], mm0		/* write 8 result bytes */
		add eax, 8		/* step the source pointer */
		add edi, 8		/* step the destination pointer */
		dec ecx			/* one group done */
		jnz L1021		/* repeat until the group count hits zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* Intrinsics variant, valid for both i386 and x86_64 */
	__m64 *in = (__m64*)Src1;
	__m64 *out = (__m64*)Dest;
	unsigned int groups = SrcLength / 8;
	int packed;
	__m64 addend;

	/* Replicate C into four bytes of an int, then into all eight MMX bytes */
	memset(&packed, C, 4);
	addend = _m_punpckldq(_m_from_int(packed), _m_from_int(packed));

	while (groups > 0) {
		*out = _m_paddusb(*in, addend);		/* S + C, saturating per byte */
		in++;
		out++;
		groups--;
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using AddByte: D = saturation255(S + C)

For C == 0 the result is simply a copy of the source. Otherwise, when MMX is
reported available and the buffer holds at least 8 bytes, the leading
multiple-of-8 bytes are handed to the MMX helper; every byte the helper did
not process (the tail, or the whole buffer when the helper reports failure)
is computed by the portable C loop.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant value to add (C).

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i;
	unsigned int istart = 0;
	int iC;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Special case: C==0, so D = S. The destination is the FIRST argument
	 * of the copy; memmove is used so that in-place calls (Dest == Src1)
	 * and overlapping buffers stay well defined.
	 */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/*
	 * Skip the MMX-sized prefix only if the helper actually ran: it returns
	 * -1 without touching Dest when MMX support is compiled out, and in that
	 * case the C loop below must cover the whole buffer.
	 */
	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
	    (SDL_imageFilterAddByteMMX(Src1, Dest, length, C) == 0)) {
		istart = length & 0xfffffff8;
	}

	/* C routine: whole image, or the (length % 8) trailing bytes after MMX */
	iC = (int) C;
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + iC;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}

/*!
\brief Internal MMX Filter using AddUint: D = saturation255((S[i] + Cs[i % 4]), Cs=Swap32((uint)C)

Works on SrcLength/8 groups of eight bytes; a trailing remainder of fewer
than eight bytes is not touched by this routine. The two code paths build the
8-byte addend differently: the inline-assembly path puts C in the lower four
bytes and D in the upper four, while the intrinsics path puts C in both
halves and does not use D.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to add (C).
\param D Byteorder-swapped constant to add (Cs).

\return Returns 0 when built with USE_MMX, otherwise -1 (nothing is processed).
*/
static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* Build the 8-byte addend in mm1 */
		mov eax, C		/* eax = C */
		movd mm1, eax		/* low dword of mm1 = C */
		mov eax, D		/* eax = D */
		movd mm2, eax		/* low dword of mm2 = D */
		punpckldq mm1, mm2	/* mm1 = D (upper dword) : C (lower dword) */
		mov eax, Src1		/* eax = read pointer for S */
		mov edi, Dest		/* edi = write pointer for D */
		mov ecx, SrcLength	/* ecx = byte count ... */
		shr ecx, 3		/* ... turned into a count of 8-byte groups */
		align 16		/* align the loop entry on 16 bytes */
L11023:
		movq mm0, [eax]		/* mm0 = 8 bytes of S */
		paddusb mm0, mm1	/* mm0 = S + addend, saturating per byte */
		movq [edi], mm0		/* write 8 result bytes */
		add eax, 8		/* step the source pointer */
		add edi, 8		/* step the destination pointer */
		dec ecx			/* one group done */
		jnz L11023		/* repeat until the group count hits zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* Intrinsics variant, valid for both i386 and x86_64 */
	__m64 *in = (__m64*)Src1;
	__m64 *out = (__m64*)Dest;
	unsigned int groups = SrcLength / 8;
	/* The 32-bit constant C is placed in both halves of the 8-byte addend */
	__m64 addend = _m_punpckldq(_m_from_int(C), _m_from_int(C));

	while (groups > 0) {
		*out = _m_paddusb(*in, addend);		/* S + addend, saturating per byte */
		in++;
		out++;
		groups--;
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using AddUint: D = saturation255((S[i] + Cs[i % 4]), Cs=Swap32((uint)C)

For C == 0 the result is simply a copy of the source. Otherwise, when MMX is
reported available and the buffer holds at least 8 bytes, the leading
multiple-of-8 bytes are handed to the MMX helper; every byte the helper did
not process (the tail, or the whole buffer when the helper reports failure)
is computed by the portable C loop. In the C loop, the byte at index i
receives bits (8 * (i % 4)) .. (8 * (i % 4) + 7) of C.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to add (C).

\return Returns 0 for success or -1 for error (a NULL pointer argument).
*/
int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
	unsigned int i, D;
	unsigned int istart = 0;
	int iC[4];
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/*
	 * Special case: C==0, so D = S. The destination is the FIRST argument
	 * of the copy; memmove is used so that in-place calls (Dest == Src1)
	 * and overlapping buffers stay well defined.
	 */
	if (C == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* MMX routine */
		D = SWAP_32(C);
		/*
		 * Skip the MMX-sized prefix only if the helper actually ran: it
		 * returns -1 without touching Dest when MMX support is compiled
		 * out, and then the C loop below must cover the whole buffer.
		 */
		if (SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D) == 0)
			istart = length & 0xfffffff8;
	}

	/* C routine to process bytes: split C into its four bytes, lowest first */
	iC[3] = (int) ((C >> 24) & 0xff);
	iC[2] = (int) ((C >> 16) & 0xff);
	iC[1] = (int) ((C >>  8) & 0xff);
	iC[0] = (int) ((C >>  0) & 0xff);
	/* istart is 0 or a multiple of 8, so (i & 3) is the position inside each 4-byte group */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] + iC[i & 3];
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}
	return (0);
}

/*!
\brief Internal MMX Filter using AddByteToHalf: D = saturation255(S/2 + C)

Works on SrcLength/8 groups of eight bytes; a trailing remainder of fewer
than eight bytes is not touched by this routine. The halving is done with a
16-bit word shift, and the mask is then applied to every byte of the shifted
value.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to add (C).
\param Mask Pointer to 8 mask bytes of value 0x7F.

\return Returns 0 when built with USE_MMX, otherwise -1 (nothing is processed).
*/
static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
									unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
		/* Replicate C into all 8 bytes of mm1 */
		mov al, C		/* al = C */
		mov ah, al		/* ax = C:C */
		mov bx, ax		/* keep the 16-bit pair in bx */
		shl eax, 16		/* move the pair into the upper half of eax */
		mov ax, bx		/* eax = C:C:C:C */
		movd mm1, eax		/* low dword of mm1 = C x4 */
		movd mm2, eax		/* low dword of mm2 = C x4 */
		punpckldq mm1, mm2	/* mm1 = C x8 */
		mov edx, Mask		/* edx = address of the mask bytes */
		movq mm0, [edx]		/* mm0 = 8 mask bytes */
		mov eax, Src1		/* eax = read pointer for S */
		mov edi, Dest		/* edi = write pointer for D */
		mov ecx, SrcLength	/* ecx = byte count ... */
		shr ecx, 3		/* ... turned into a count of 8-byte groups */
		align 16		/* align the loop entry on 16 bytes */
L1022:
		movq mm2, [eax]		/* mm2 = 8 bytes of S */
		psrlw mm2, 1		/* shift the 4 words of mm2 right by one bit */
		pand mm2, mm0		/* apply the mask to all 8 bytes of mm2 */
		paddusb mm2, mm1	/* mm2 = S/2 + C, saturating per byte */
		movq [edi], mm2		/* write 8 result bytes */
		add eax, 8		/* step the source pointer */
		add edi, 8		/* step the destination pointer */
		dec ecx			/* one group done */
		jnz L1022		/* repeat until the group count hits zero */
		emms			/* leave MMX state */
		popa
	}
#else
	/* Intrinsics variant, valid for both i386 and x86_64 */
	__m64 *in = (__m64*)Src1;
	__m64 *out = (__m64*)Dest;
	__m64 *mask = (__m64*)Mask;
	unsigned int groups = SrcLength / 8;
	int packed;
	__m64 addend;

	/* Replicate C into four bytes of an int, then into all eight MMX bytes */
	memset(&packed, C, 4);
	addend = _m_punpckldq(_m_from_int(packed), _m_from_int(packed));

	while (groups > 0) {
		/* Word-wise shift right by one, then mask every byte */
		__m64 halved = _m_pand(_m_psrlwi(*in, 1), *mask);
		*out = _m_paddusb(addend, halved);	/* S/2 + C, saturating per byte */
		in++;
		out++;
		groups--;
	}
	_m_empty();					/* leave MMX state */
#endif
	return (0);
#else
	return (-1);
#endif
}

/*!
\brief Filter using AddByteToHalf: D = saturation255(S/2 + C)

When MMX is detected and the array holds at least 8 bytes, the leading whole
8-byte groups are handed to the MMX routine. If that routine reports failure
(it returns -1 when MMX support is not compiled in) the whole array is
processed by the portable C loop instead; otherwise the C loop only handles
the trailing (length % 8) bytes. Src1 and Dest may point to the same buffer.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to add (C).

\return Returns 0 for success or -1 for error (NULL pointer argument).
*/
int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be processed by the C loop */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Only skip the bytes the MMX routine really processed */
		if (SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the bytes left over after the MMX pass */
	for (i = istart; i < length; i++) {
		result = (int) (Src1[i] / 2) + (int) C;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using SubByte: D = saturation0(S - C)

Processes SrcLength/8 groups of 8 bytes; the trailing SrcLength%8 bytes are
not touched. When USE_MMX is not defined nothing is processed and -1 is
returned.

NOTE(review): the other internal MMX helpers visible in this part of the file
are declared static, this one is not - confirm whether external linkage is
intended.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 for success or -1 for error.
*/
int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/*
	 * The assembly loop tests its counter only after the first iteration
	 * (dec ecx / jnz), so this variant expects SrcLength >= 8.
	 */
	__asm
	{
		pusha
			/* ** Duplicate C in 8 bytes of MM1 ** */
			mov al, C   	/* load C into AL */
			mov ah, al   	/* copy AL into AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm1, eax   	/* copy EAX into MM1 */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L1023:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psubusb mm0,  mm1 	/* MM0=Src1-C (sub 8 bytes with saturation) */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L1023    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	/* Duplicate C in 8 bytes of MM1 */
	int i;
	memset(&i, C, 4);				/* i = C replicated into 4 bytes */
	__m64 mm1 = _m_from_int(i);
	__m64 mm2 = _m_from_int(i);
	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
        /* x86_64-only alternative: __m64 mm1 = _m_from_int64(lli); */
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_psubusb(*mSrc1, mm1);	/* Src1-C (sub 8 bytes with saturation) */
		mSrc1++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: report failure, Dest is left untouched */
	return (-1);
#endif
}

/*!
\brief Filter using SubByte: D = saturation0(S - C)

When MMX is detected and the array holds at least 8 bytes, the leading whole
8-byte groups are handed to the MMX routine. If that routine reports failure
(it returns -1 when MMX support is not compiled in) the whole array is
processed by the portable C loop instead; otherwise the C loop only handles
the trailing (length % 8) bytes. Src1 and Dest may point to the same buffer.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 for success or -1 for error (NULL pointer argument).
*/
int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be processed by the C loop */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: C==0 is a plain copy from source to destination */
	if (C == 0) {
		/* memmove: destination first, and safe when Src1 and Dest overlap */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Only skip the bytes the MMX routine really processed */
		if (SDL_imageFilterSubByteMMX(Src1, Dest, length, C) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the bytes left over after the MMX pass */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] - (int) C;
		if (result < 0)
			result = 0;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)

Processes SrcLength/8 groups of 8 bytes; the trailing SrcLength%8 bytes are
not touched. When USE_MMX is not defined nothing is processed and -1 is
returned.

NOTE(review): the two MMX variants build the 8-byte subtrahend differently.
The inline-assembly variant places C in the lower and D in the upper 4 bytes,
while the intrinsics variant places C in both halves and never reads D.
Confirm which of the two is intended.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param C Constant to subtract (C).
\param D Byteorder-swapped constant to subtract (Cs). Only read by the inline-assembly variant.

\return Returns 0 for success or -1 for error.
*/
static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/*
	 * The assembly loop tests its counter only after the first iteration
	 * (dec ecx / jnz), so this variant expects SrcLength >= 8.
	 */
	__asm
	{
		pusha
			/* ** Build the 8-byte subtrahend in MM1: C (low dword), D (high dword) ** */
			mov eax, C   	/* load C into EAX */
			movd mm1, eax   	/* copy EAX into MM1 */
			mov eax, D   	/* load D into EAX */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with D */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L11024:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psubusb mm0, mm1 	/* MM0=Src1-MM1 (sub 8 bytes with saturation) */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L11024    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	/* Duplicate (int)C in 8 bytes of MM1 (D is not used in this variant) */
	__m64 mm1 = _m_from_int(C);
	__m64 mm2 = _m_from_int(C);
	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
        /* x86_64-only alternative: __m64 mm1 = _m_from_int64(lli); */
	int i;
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_psubusb(*mSrc1, mm1);	/* Src1-C (sub 8 bytes with saturation) */
		mSrc1++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: report failure, Dest is left untouched */
	return (-1);
#endif
}

/*!
\brief Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)

When MMX is detected and the array holds at least 8 bytes, the leading whole
8-byte groups are handed to the MMX routine. If that routine reports failure
(it returns -1 when MMX support is not compiled in) the whole array is
processed by the portable C loop instead; otherwise the C loop only handles
the trailing (length % 8) bytes. The C loop subtracts byte (i % 4) of C,
counted from the least significant byte, from S[i].
Src1 and Dest may point to the same buffer.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to subtract (C).

\return Returns 0 for success or -1 for error (NULL pointer argument).
*/
int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be processed by the C loop */
	int iC[4];
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: C==0 is a plain copy from source to destination */
	if (C == 0) {
		/* memmove: destination first, and safe when Src1 and Dest overlap */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Only skip the bytes the MMX routine really processed */
		if (SDL_imageFilterSubUintMMX(Src1, Dest, length, C, SWAP_32(C)) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the bytes left over after the MMX pass */
	iC[3] = (int) ((C >> 24) & 0xff);
	iC[2] = (int) ((C >> 16) & 0xff);
	iC[1] = (int) ((C >>  8) & 0xff);
	iC[0] = (int) ((C >>  0) & 0xff);
	/* istart is 0 or a multiple of 8, so (i & 3) is the position inside the 4-byte group */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] - iC[i & 3];
		if (result < 0)
			result = 0;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using ShiftRight: D = saturation0(S >> N)

Processes SrcLength/8 groups of 8 bytes; the trailing SrcLength%8 bytes are
not touched. When USE_MMX is not defined nothing is processed and -1 is
returned.

The bytes are shifted as 16-bit words, so bits of the upper byte of each word
leak into the lower byte; a per-byte bit mask (0xFF >> N in every byte, built
by N masked one-bit shifts) removes them again.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8; the inline-assembly variant requires N >= 1 and SrcLength >= 8 because its loops test their counters only after the first iteration.
\param Mask Byte array containing 8 bytes with 0x7F value.

\return Returns 0 for success or -1 for error.
*/
static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
								 unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			mov edx, Mask   	/* load Mask address into edx */
			movq mm0, [edx]   	/* load Mask into mm0 */
		xor ecx, ecx   	/* zero ECX */
			mov cl,  N 	/* load loop counter (N) into CL */
			movd mm3,  ecx 	/* copy (N) into MM3  */
			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
L10240:                  	/* ** Prepare proper bit-Mask in MM1 ** */
		psrlw mm1,  1 	/* shift 4 WORDS of MM1 1 bit to the right */
			pand mm1, mm0   /* apply Mask to 8 BYTES of MM1 */
			/*  byte     0x0f, 0xdb, 0xc8 */
			dec               cl    	/* decrease loop counter */
			jnz            L10240    	/* check loop termination, proceed if required */
			/* ** Shift all bytes of the image ** */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L10241:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psrlw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the right */
			pand mm0, mm1    /* apply proper bit-Mask to 8 BYTES of MM0 */
			/* byte     0x0f, 0xdb, 0xc1 */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10241    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	__m64 *mMask = (__m64*)Mask;
	/* Give mm1 a defined value first: comparing an uninitialised variable with itself is undefined behaviour in C */
	__m64 mm1 = _m_from_int(0);
	int i;
	mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 */
	/* Prepare proper bit-Mask in MM1 */
	for (i = 0; i < N; i++) {
		mm1 = _m_psrlwi(mm1, 1);		/* shift 4 WORDS of MM1 1 bit to the right */
		mm1 = _m_pand(mm1, *mMask);		/* apply Mask to 8 BYTES of MM1 */
	}
	/* Shift all bytes of the image */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm0 = _m_psrlwi(*mSrc1, N);	/* shift 4 WORDS of MM0 (N) bits to the right */
		*mDest = _m_pand(mm0, mm1);		/* apply proper bit-Mask to 8 BYTES of MM0 */
		mSrc1++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: report failure, Dest is left untouched */
	return (-1);
#endif
}

/*!
\brief Filter using ShiftRight: D = saturation0(S >> N)

When MMX is detected and the array holds at least 8 bytes, the leading whole
8-byte groups are handed to the MMX routine. If that routine reports failure
(it returns -1 when MMX support is not compiled in) the whole array is
processed by the portable C loop instead; otherwise the C loop only handles
the trailing (length % 8) bytes. Src1 and Dest may point to the same buffer.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (NULL pointer argument or N > 8).
*/
int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be processed by the C loop */

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Check shift */
	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination */
	if (N == 0) {
		/* memmove: destination first, and safe when Src1 and Dest overlap */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Only skip the bytes the MMX routine really processed */
		if (SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the bytes left over after the MMX pass */
	for (i = istart; i < length; i++) {
		Dest[i] = (unsigned char) (Src1[i] >> N);
	}

	return (0);
}

/*!
\brief Internal MMX Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)

Processes SrcLength/8 groups of 8 bytes, shifting each group as two 32-bit
values; the trailing SrcLength%8 bytes are not touched. When USE_MMX is not
defined nothing is processed and -1 is returned.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N).

\return Returns 0 for success or -1 for error.
*/
static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/*
	 * The assembly loop tests its counter only after the first iteration
	 * (dec ecx / jnz), so this variant expects SrcLength >= 8.
	 */
	__asm
	{
		pusha
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L13023:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psrld mm0, N	/* shift 2 DWORDS of MM0 (N) bits to the right; NOTE(review): N is a byte-sized variable used as the count operand here - confirm the assembler reads only that byte */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L13023    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	int i;
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_psrldi(*mSrc1, N);		/* shift 2 DWORDS (N) bits to the right */
		mSrc1++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: report failure, Dest is left untouched */
	return (-1);
#endif
}

/*!
\brief Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)

The array is treated as a sequence of unsigned int values in host byte order.
When MMX is detected and the array holds at least 8 bytes, the leading whole
8-byte groups are handed to the MMX routine. If that routine reports failure
(it returns -1 when MMX support is not compiled in) the whole array is
processed by the portable C loop instead; otherwise the C loop only handles
the bytes left over. The C loop shifts every complete unsigned int word;
trailing bytes that do not form a complete word are left unmodified in Dest.
Src1 and Dest may point to the same buffer and need not be aligned.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success or -1 for error (NULL pointer argument or N > 32).
*/
int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	const unsigned int step = (unsigned int) sizeof(unsigned int);
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be processed by the C loop */
	unsigned int word;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	if (N > 32) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination */
	if (N == 0) {
		/* memmove: destination first, and safe when Src1 and Dest overlap */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Only skip the bytes the MMX routine really processed */
		if (SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/*
	 * C routine: every complete word from istart on, including the last one
	 * that ends exactly at 'length'. i never exceeds length, so (length - i)
	 * cannot wrap around.
	 */
	for (i = istart; (length - i) >= step; i += step) {
		/* memcpy keeps the host byte order without requiring aligned pointers */
		memcpy(&word, &Src1[i], sizeof(word));
		/* Shifting a 32-bit value by 32 is undefined in C; the result is defined here as 0 */
		word = (N < 32) ? (word >> N) : 0;
		memcpy(&Dest[i], &word, sizeof(word));
	}

	return (0);
}

/*!
\brief Internal MMX Filter using MultByByte: D = saturation255(S * C)

Processes SrcLength/8 groups of 8 bytes; the trailing SrcLength%8 bytes are
not touched. When USE_MMX is not defined nothing is processed and -1 is
returned.

Bytes are widened to 16-bit words, multiplied, and packed back with
packuswb, which interprets the words as signed. For C <= 128 every product
fits into a positive signed word and can be packed directly. For larger C a
product may have its top bit set; the absolute value of the word is taken
first so that such products saturate to 255 instead of 0.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array. The inline-assembly variant requires SrcLength >= 8 because its loops test the counter only after the first iteration.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error.
*/
static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			/* ** Duplicate C in 4 words of MM1 ** */
			mov al, C   	/* load C into AL */
			xor ah, ah   	/* zero AH */
			mov bx, ax   	/* copy AX into BX (BL keeps C for the comparison below) */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm1, eax   	/* copy EAX into MM1 */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
			pxor mm0, mm0   	/* zero MM0 register */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			/* EAX (and so AL) holds the Src1 address by now: compare the copy of C kept in BL */
			cmp bl, 128   	/* if (C <= 128) execute more efficient code */
			ja             L10251	/* unsigned comparison: C is an unsigned byte */
			align 16                 	/* 16 byte alignment of the loop entry */
L10250:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
			pmullw mm3, mm1   	/* mul low  bytes of Src1 and MM1 */
			pmullw mm4, mm1   	/* mul high bytes of Src1 and MM1 */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10250    	/* check loop termination, proceed if required */
			jmp            L10252
			align 16                 	/* 16 byte alignment of the loop entry */
L10251:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
			pmullw mm3, mm1   	/* mul low  bytes of Src1 and MM1 */
			pmullw mm4, mm1   	/* mul high bytes of Src1 and MM1 */
			/* ** Take abs value of the results (signed words) ** */
			movq mm5, mm3   	/* copy mm3 into mm5 */
			movq mm6, mm4   	/* copy mm4 into mm6 */
			psraw mm5, 15   	/* fill mm5 words with word sign bit */
			psraw mm6, 15   	/* fill mm6 words with word sign bit */
			pxor mm3, mm5   	/* take 1's complement of only neg words */
			pxor mm4, mm6   	/* take 1's complement of only neg words */
			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10251    	/* check loop termination, proceed if required */
L10252:
		emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0 = _m_from_int(0);				/* zero mm0 register */
	/* Duplicate C in 4 words of MM1 */
	int i;
	i = C | C<<16;
	__m64 mm1 = _m_from_int(i);
	__m64 mm2 = _m_from_int(i);
	mm1 = _m_punpckldq(mm1, mm2);				/* fill higher words of MM1 with C */
	if (C <= 128) {						/* if (C <= 128) execute more efficient code */
		for (i = 0; i < SrcLength/8; i++) {
			__m64 mm3, mm4;
			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
			mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
			mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
			mSrc1++;
			mDest++;
		}
	} else {
		for (i = 0; i < SrcLength/8; i++) {
			__m64 mm3, mm4, mm5, mm6;
			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
			mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
			mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
			/* Take abs value of the results (signed words) */
			mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
			mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
			mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
			mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
			mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
			mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
			mSrc1++;
			mDest++;
		}
	}
	_m_empty();						/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: report failure, Dest is left untouched */
	return (-1);
#endif
}

/*!
\brief Filter using MultByByte: D = saturation255(S * C)

When MMX is detected and the array holds at least 8 bytes, the leading whole
8-byte groups are handed to the MMX routine. If that routine reports failure
(it returns -1 when MMX support is not compiled in) the whole array is
processed by the portable C loop instead; otherwise the C loop only handles
the trailing (length % 8) bytes. Src1 and Dest may point to the same buffer.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error (NULL pointer argument).
*/
int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be processed by the C loop */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Special case: C==1 is a plain copy from source to destination */
	if (C == 1) {
		/* memmove: destination first, and safe when Src1 and Dest overlap */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Only skip the bytes the MMX routine really processed */
		if (SDL_imageFilterMultByByteMMX(Src1, Dest, length, C) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the bytes left over after the MMX pass */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] * (int) C;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using ShiftRightAndMultByByteMMX: D = saturation255((S >> N) * C)

Processes SrcLength/8 groups of 8 bytes; the trailing SrcLength%8 bytes are
not touched. When USE_MMX is not defined nothing is processed and -1 is
returned.

Bytes are widened to 16-bit words, shifted, multiplied, and packed back with
packuswb, which interprets the words as signed. A product of 32768 or more
(possible for small N and large C, e.g. N=0, S=255, C=255) has its top bit
set and would be packed to 0; the absolute value of each word is therefore
taken before packing so that such products saturate to 255. Words without
the top bit set are not changed by that step.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array. The inline-assembly variant requires SrcLength >= 8 because its loop tests the counter only after the first iteration.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error.
*/
static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
											  unsigned char C)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			/* ** Duplicate C in 4 words of MM1 ** */
			mov al, C   	/* load C into AL */
			xor ah, ah   	/* zero AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm1, eax   	/* copy EAX into MM1 */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
			xor ecx, ecx   	/* zero ECX */
			mov cl, N   	/* load N into CL */
			movd mm7, ecx   	/* copy N into MM7 */
			pxor mm0, mm0   	/* zero MM0 register */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L1026:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
			psrlw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the right */
			psrlw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the right */
			pmullw mm3, mm1   	/* mul low  bytes of Src1 by MM1 */
			pmullw mm4, mm1   	/* mul high bytes of Src1 by MM1 */
			/* ** Take abs value of the results (signed words) so that products >= 32768 saturate to 255 ** */
			movq mm5, mm3   	/* copy mm3 into mm5 */
			movq mm6, mm4   	/* copy mm4 into mm6 */
			psraw mm5, 15   	/* fill mm5 words with word sign bit */
			psraw mm6, 15   	/* fill mm6 words with word sign bit */
			pxor mm3, mm5   	/* take 1's complement of only neg words */
			pxor mm4, mm6   	/* take 1's complement of only neg words */
			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L1026    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0 = _m_from_int(0);			/* zero mm0 register */
	/* Duplicate C in 4 words of MM1 */
	int i;
	i = (C<<16)|C;
	__m64 mm1 = _m_from_int(i);
	__m64 mm2 = _m_from_int(i);
	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher words of MM1 with C */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm3, mm4, mm5, mm6;
		mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
		mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
		mm3 = _m_psrlwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the right */
		mm4 = _m_psrlwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the right */
		mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
		mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
		/* Take abs value of the results (signed words) so that products >= 32768 saturate to 255 */
		mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
		mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
		mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
		mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
		mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
		mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
		*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
		mSrc1++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: report failure, Dest is left untouched */
	return (-1);
#endif
}

/*!
\brief Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C)

When MMX is detected and the array holds at least 8 bytes, the leading whole
8-byte groups are handed to the MMX routine. If that routine reports failure
(it returns -1 when MMX support is not compiled in) the whole array is
processed by the portable C loop instead; otherwise the C loop only handles
the trailing (length % 8) bytes. Src1 and Dest may point to the same buffer.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
\param C Constant to multiply with (C).

\return Returns 0 for success or -1 for error (NULL pointer argument or N > 8).
*/
int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
										   unsigned char C)
{
	unsigned int i;
	unsigned int istart = 0;	/* first byte still to be processed by the C loop */
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return (-1);
	if (length == 0)
		return (0);

	/* Check shift */
	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 && C==1 is a plain copy from source to destination */
	if ((N == 0) && (C == 1)) {
		/* memmove: destination first, and safe when Src1 and Dest overlap */
		memmove(Dest, Src1, length);
		return (0);
	}

	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		/* Only skip the bytes the MMX routine really processed */
		if (SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: whole image, or the bytes left over after the MMX pass */
	for (i = istart; i < length; i++) {
		result = (int) (Src1[i] >> N) * (int) C;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief Internal MMX Filter using ShiftLeftByte: D = (S << N)

Processes SrcLength/8 groups of 8 bytes; the trailing SrcLength%8 bytes are
not touched. When USE_MMX is not defined nothing is processed and -1 is
returned.

The bytes are shifted as 16-bit words, so bits of the lower byte of each word
leak into the upper byte; a per-byte bit mask (0xFF << N in every byte, built
by N masked one-bit shifts) removes them again.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source arrays.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8; the inline-assembly variant requires N >= 1 and SrcLength >= 8 because its loops test their counters only after the first iteration.
\param Mask Byte array containing 8 bytes of 0xFE value.

\return Returns 0 for success or -1 for error.
*/
static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
									unsigned char *Mask)
{
#ifdef USE_MMX
#if !defined(GCC__)
	__asm
	{
		pusha
			mov edx, Mask   	/* load Mask address into edx */
			movq mm0, [edx]   	/* load Mask into mm0 */
		xor ecx, ecx   	/* zero ECX */
			mov cl, N   	/* load loop counter (N) into CL */
			movd mm3, ecx   	/* copy (N) into MM3  */
			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
L10270:                  	/* ** Prepare proper bit-Mask in MM1 ** */
		psllw mm1, 1   	/* shift 4 WORDS of MM1 1 bit to the left */
			pand mm1, mm0        /* apply Mask to 8 BYTES of MM1 */
			/*  byte     0x0f, 0xdb, 0xc8 */
			dec cl                  	/* decrease loop counter */
			jnz            L10270    	/* check loop termination, proceed if required */
			/* ** Shift all bytes of the image ** */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L10271:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		psllw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the left */
			pand mm0, mm1    /* apply proper bit-Mask to 8 BYTES of MM0 */
			/* byte     0x0f, 0xdb, 0xc1 */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10271    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	__m64 *mMask = (__m64*)Mask;
	/* Give mm1 a defined value first: comparing an uninitialised variable with itself is undefined behaviour in C */
	__m64 mm1 = _m_from_int(0);
	int i;
	mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 */
	/* Prepare proper bit-Mask in MM1 */
	for (i = 0; i < N; i++) {
		mm1 = _m_psllwi(mm1, 1);		/* shift 4 WORDS of MM1 1 bit to the left */
		mm1 = _m_pand(mm1, *mMask);		/* apply Mask to 8 BYTES of MM1 */
	}
	/* Shift all bytes of the image */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm0 = _m_psllwi(*mSrc1, N);	/* shift 4 WORDS of MM0 (N) bits to the left */
		*mDest = _m_pand(mm0, mm1);		/* apply proper bit-Mask to 8 BYTES of MM0 */
		mSrc1++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: report failure, Dest is left untouched */
	return (-1);
#endif
}

/*!
\brief Filter using ShiftLeftByte: D = (S << N)

Every byte is shifted on its own; bits pushed past bit 7 are discarded (no
saturation), so N == 8 clears the byte.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source arrays.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (NULL pointer or N out of range).
*/
int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
	unsigned int i, istart;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination.
	   memmove keeps in-place calls (Src1 == Dest) well defined. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/* Let the MMX routine process all complete 8-byte groups. It returns -1
	   without touching Dest when MMX support is not compiled in; in that
	   case the C loop below has to process the whole array. */
	istart = 0;
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine to process the bytes not handled above */
	for (i = istart; i < length; i++) {
		Dest[i] = (unsigned char) ((Src1[i] << N) & 0xff);
	}

	return (0);
}

/*!
\brief Internal MMX Filter using ShiftLeftUint: D = ((uint)S << N)

Processes SrcLength/8 groups of 8 bytes; within each group the two 32-bit
values are shifted left independently (PSLLD). Remaining bytes are not touched.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
*/
static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The loop counter is tested only after the first pass, so SrcLength/8
	   must be non-zero on entry. */
	__asm
	{
		pusha
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L12023:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		/* NOTE(review): N is a C byte variable used directly as the PSLLD
		   count operand here, whereas the byte-shift routine first moves N
		   into an MMX register - confirm the assembler encodes the intended count. */
		pslld mm0, N   	/* shift 2 DWORDS of MM0 (N) bits to the left */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L12023    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	int i;
	for (i = 0; i < SrcLength/8; i++) {
		*mDest = _m_pslldi(*mSrc1, N);	/* shift 2 DWORDS of Src1 (N) bits to the left */
		mSrc1++;
		mDest++;
	}
	_m_empty();				/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: nothing was processed */
	return (-1);
#endif
}

/*!
\brief Filter using ShiftLeftUint: D = ((uint)S << N)

The arrays are treated as a sequence of unsigned int values in native byte
order. Trailing bytes that do not form a complete unsigned int are not
written to Dest (except for N == 0, which copies all bytes).

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 32.

\return Returns 0 for success or -1 for error (NULL pointer or N out of range).
*/
int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	unsigned int i, istart;
	unsigned int word;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if (N > 32) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination.
	   memmove keeps in-place calls (Src1 == Dest) well defined. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/* Let the MMX routine process all complete 8-byte groups. It returns -1
	   without touching Dest when MMX support is not compiled in; in that
	   case the C loop below has to process the whole array. */
	istart = 0;
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine: process every complete unsigned int that is left.
	   The values are moved with memcpy so that unaligned buffers are safe. */
	for (i = istart; (length - i) >= sizeof(word); i += (unsigned int) sizeof(word)) {
		memcpy(&word, &Src1[i], sizeof(word));
		/* Shifting by the full width is undefined in C; the result is 0 */
		word = (N < (sizeof(word) * 8)) ? (word << N) : 0;
		memcpy(&Dest[i], &word, sizeof(word));
	}

	return (0);
}

/*!
\brief Internal MMX Filter ShiftLeft: D = saturation255(S << N)

Processes SrcLength/8 groups of 8 bytes; remaining bytes are not touched.
Each byte is widened to a 16-bit word, shifted, and packed back to a byte
with unsigned saturation. For N > 7 the shifted word can have its sign bit
set, which the signed-input pack instruction would turn into 0; the slower
path therefore takes the absolute value of the words before packing.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
*/
static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The loop counter is tested only after the first pass, so SrcLength/8
	   must be non-zero on entry. */
	__asm
	{
		pusha
			/* N is kept in EBX: EAX is reused for the Src1 address below, so the
			   path selection must not look at AL. */
			xor ebx, ebx   	/* zero EBX */
			mov bl, N   	/* load N into BL */
			movd mm7, ebx   	/* copy N into MM7 */
			pxor mm0, mm0   	/* zero MM0 register */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			cmp bl, 7   	/* if (N <= 7) execute more efficient code */
			ja             L10281
			align 16                 	/* 16 byte alignment of the loop entry */
L10280:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10280    	/* check loop termination, proceed if required */
			jmp            L10282
			align 16                 	/* 16 byte alignment of the loop entry */
L10281:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
			/* ** Take abs value of the signed words ** */
			movq mm5, mm3   	/* copy mm3 into mm5 */
			movq mm6, mm4   	/* copy mm4 into mm6 */
			psraw mm5, 15   	/* fill mm5 words with word sign bit */
			psraw mm6, 15   	/* fill mm6 words with word sign bit */
			pxor mm3, mm5   	/* take 1's complement of only neg words */
			pxor mm4, mm6   	/* take 1's complement of only neg words */
			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz            L10281    	/* check loop termination, proceed if required */
L10282:
		emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0 = _m_from_int(0);				/* zero mm0 register */
	int i;
	if (N <= 7) {						/* if (N <= 7) execute more efficient code */
		for (i = 0; i < SrcLength/8; i++) {
			__m64 mm3, mm4;
			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
			mSrc1++;
			mDest++;
		}
	} else {
		for (i = 0; i < SrcLength/8; i++) {
			__m64 mm3, mm4, mm5, mm6;
			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
			/* Take abs value of the signed words */
			mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
			mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
			mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
			mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
			mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
			mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
			mSrc1++;
			mDest++;
		}
	}
	_m_empty();						/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: nothing was processed */
	return (-1);
#endif
}

/*!
\brief Filter ShiftLeft: D = saturation255(S << N)

Every byte is shifted on its own; results larger than 255 are clamped to 255.

\param Src1 Pointer to the start of the source byte array (S1).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param N Number of bit-positions to shift (N). Valid range is 0 to 8.

\return Returns 0 for success or -1 for error (NULL pointer or N out of range).
*/
int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
{
	unsigned int i, istart;
	int result;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	if (N > 8) {
		return (-1);
	}

	/* Special case: N==0 is a plain copy from source to destination.
	   memmove keeps in-place calls (Src1 == Dest) well defined. */
	if (N == 0) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/* Let the MMX routine process all complete 8-byte groups. It returns -1
	   without touching Dest when MMX support is not compiled in; in that
	   case the C loop below has to process the whole array. */
	istart = 0;
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine to process the bytes not handled above */
	for (i = istart; i < length; i++) {
		result = (int) Src1[i] << N;
		if (result > 255)
			result = 255;
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/*!
\brief MMX BinarizeUsingThreshold: D = (S >= T) ? 255:0

Processes SrcLength/8 groups of 8 bytes; remaining bytes are not touched.
The comparison is done without an unsigned compare instruction: adding
(0xFF - T) with unsigned saturation yields 0xFF exactly when S >= T, and an
equality test against all ones turns that into 255 or 0.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param T The threshold boundary (inclusive).

\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
*/
static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The loop counter is tested only after the first pass, so SrcLength/8
	   must be non-zero on entry. */
	__asm
	{
		pusha
			/* ** Duplicate T in 8 bytes of MM3 ** */
			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
			pcmpeqb mm2, mm2   	/* generate all 1's in mm2 */
			mov al, T   	/* load T into AL */
			mov ah, al   	/* copy AL into AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm3, eax   	/* copy EAX into MM3 */
			movd mm4, eax   	/* copy EAX into MM4 */
			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with T */
			psubusb mm2, mm3   	/* store 0xFF - T in MM2 */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L1029:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		paddusb mm0, mm2   	/* MM0=Src1+(0xFF-T) (add 8 bytes with saturation) */
			pcmpeqb mm0, mm1   	/* binarize 255:0, comparing to 255 */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L1029    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	/* mm1/mm2 are each compared with themselves, so their (unset) previous content does not matter */
	__m64 mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 */
	__m64 mm2 = _m_pcmpeqb(mm2, mm2);			/* generate all 1's in mm2 */
	int i;
	/* Duplicate T in 8 bytes of MM3: first fill the 4 bytes of i with T ... */
	memset(&i, T, 4);
	__m64 mm3 = _m_from_int(i);
	__m64 mm4 = _m_from_int(i);
	mm3 = _m_punpckldq(mm3, mm4);			/* fill higher bytes of MM3 with T */
	mm2 = _m_psubusb(mm2, mm3);			/* store 0xFF - T in MM2 */
	/* (a single _m_from_int64() would also do, but is x86_64 only) */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm0 = _m_paddusb(*mSrc1, mm2);	/* Src1+(0xFF-T) (add 8 bytes with saturation) */
		*mDest = _m_pcmpeqb(mm0, mm1);		/* binarize 255:0, comparing to 255 */
		mSrc1++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: nothing was processed */
	return (-1);
#endif
}

/*!
\brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param T The threshold boundary (inclusive).

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
{
	unsigned int i, istart;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Special case: T==0 - every byte value satisfies S >= 0 */
	if (T == 0) {
		memset(Dest, 255, length);
		return (0);
	}

	/* Let the MMX routine process all complete 8-byte groups. It returns -1
	   without touching Dest when MMX support is not compiled in; in that
	   case the C loop below has to process the whole array. */
	istart = 0;
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine to process the bytes not handled above */
	for (i = istart; i < length; i++) {
		Dest[i] = (unsigned char) ((Src1[i] >= T) ? 255 : 0);
	}

	return (0);
}

/*!
\brief Internal MMX Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax

Processes SrcLength/8 groups of 8 bytes; remaining bytes are not touched.
The clamp is built from three unsigned saturating byte operations:
adding (0xFF - Tmax) pushes everything above Tmax to 0xFF, subtracting
(0xFF - Tmax + Tmin) pushes everything below Tmin to 0, and adding Tmin
moves the result back into the range Tmin..Tmax.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param Tmin Lower (inclusive) boundary of the clipping range.
\param Tmax Upper (inclusive) boundary of the clipping range.

\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
*/
static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
								  unsigned char Tmax)
{
#ifdef USE_MMX
#if !defined(GCC__)
	/* The loop counter is tested only after the first pass, so SrcLength/8
	   must be non-zero on entry. */
	__asm
	{
		pusha
			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
			/* ** Duplicate Tmax in 8 bytes of MM3 ** */
			mov al, Tmax   	/* load Tmax into AL */
			mov ah, al   	/* copy AL into AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm3, eax   	/* copy EAX into MM3 */
			movd mm4, eax   	/* copy EAX into MM4 */
			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with Tmax */
			psubusb mm1, mm3   	/* store 0xFF - Tmax in MM1 */
			/* ** Duplicate Tmin in 8 bytes of MM5 ** */
			mov al, Tmin   	/* load Tmin into AL */
			mov ah, al   	/* copy AL into AH */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm5, eax   	/* copy EAX into MM5 */
			movd mm4, eax   	/* copy EAX into MM4 */
			punpckldq mm5, mm4   	/* fill higher bytes of MM5 with Tmin */
			movq mm7, mm5   	/* copy MM5 into MM7 */
			paddusb mm7, mm1   	/* store 0xFF - Tmax + Tmin in MM7 */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L1030:
		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
		paddusb mm0, mm1   	/* MM0=Src1+(0xFF-Tmax) */
			psubusb mm0, mm7   	/* MM0=MM0-(0xFF-Tmax+Tmin) */
			paddusb mm0, mm5   	/* MM0=MM0+Tmin */
			movq [edi], mm0   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L1030    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	/* mm1 is compared with itself, so its (unset) previous content does not matter */
	__m64 mm1 = _m_pcmpeqb(mm1, mm1);	/* generate all 1's in mm1 */
	int i;
	/* Duplicate Tmax in 8 bytes of MM3 */
	__m64 mm3, mm4;
	memset(&i, Tmax, 4);			/* fill the 4 bytes of i with Tmax */
	mm3 = _m_from_int(i);
	mm4 = _m_from_int(i);
	mm3 = _m_punpckldq(mm3, mm4);		/* fill higher bytes of MM3 with Tmax */
	mm1 = _m_psubusb(mm1, mm3);		/* store 0xFF - Tmax in MM1 */
	/* (a single _m_from_int64() would also do, but is x86_64 only) */
	/* Duplicate Tmin in 8 bytes of MM5 */
	__m64 mm5, mm7;
	memset(&i, Tmin, 4);			/* fill the 4 bytes of i with Tmin */
	mm5 = _m_from_int(i);
	mm4 = _m_from_int(i);
	mm5 = _m_punpckldq(mm5, mm4);		/* fill higher bytes of MM5 with Tmin */
	mm7 = _m_paddusb(mm5, mm1);	/* store 0xFF - Tmax + Tmin in MM7 */
	for (i = 0; i < SrcLength/8; i++) {
		__m64 mm0;
		mm0 = _m_paddusb(*mSrc1, mm1);	/* MM0=Src1+(0xFF-Tmax) */
		mm0 = _m_psubusb(mm0, mm7);	/* MM0=MM0-(0xFF-Tmax+Tmin) */
		*mDest = _m_paddusb(mm0, mm5);	/* MM0+Tmin */
		mSrc1++;
		mDest++;
	}
	_m_empty();				/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: nothing was processed */
	return (-1);
#endif
}

/*!
\brief Filter using ClipToRange: D = (S >= Tmin) & (S <= Tmax) S:Tmin | Tmax

Values below Tmin become Tmin, values above Tmax become Tmax, everything
else is copied unchanged.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param Tmin Lower (inclusive) boundary of the clipping range.
\param Tmax Upper (inclusive) boundary of the clipping range.

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
							   unsigned char Tmax)
{
	unsigned int i, istart;

	/* Validate input parameters */
	if ((Src1 == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Special case: Tmin==0 && Tmax==255 covers the full byte range, so
	   nothing is clipped and the source is simply copied to the destination.
	   memmove keeps in-place calls (Src1 == Dest) well defined. */
	if ((Tmin == 0) && (Tmax == 255)) {
		memmove(Dest, Src1, length);
		return (0);
	}

	/* Let the MMX routine process all complete 8-byte groups. It returns -1
	   without touching Dest when MMX support is not compiled in; in that
	   case the C loop below has to process the whole array. */
	istart = 0;
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine to process the bytes not handled above */
	for (i = istart; i < length; i++) {
		if (Src1[i] < Tmin) {
			Dest[i] = Tmin;
		} else if (Src1[i] > Tmax) {
			Dest[i] = Tmax;
		} else {
			Dest[i] = Src1[i];
		}
	}

	return (0);
}

/*!
\brief Internal MMX Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)

Processes SrcLength/8 groups of 8 bytes; remaining bytes are not touched.
The scale factor (Nmax - Nmin)/(Cmax - Cmin) is computed once as an unsigned
16-bit integer division; when Cmax == Cmin the factor 255 is used instead.
Only the low 16 bits of Cmin, Cmax, Nmin and Nmax take part in the computation.

\param Src1 Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param SrcLength The number of bytes in the source array.
\param Cmin Normalization constant (Cmin).
\param Cmax Normalization constant (Cmax).
\param Nmin Normalization constant (Nmin).
\param Nmax Normalization constant (Nmax).

\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
*/
static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
									  int Nmin, int Nmax)
{
	/*
	 * NOTE(review): S-Cmin and +Nmin are done with the byte-wise saturating
	 * instructions (PSUBUSB/PADDUSB) on data that has been unpacked to
	 * 16-bit words, so each half of a word saturates independently -
	 * confirm this is intended for products or constants above 255.
	 */
#ifdef USE_MMX
#if !defined(GCC__)
	/* The loop counter is tested only after the first pass, so SrcLength/8
	   must be non-zero on entry. */
	__asm
	{
		pusha
			mov ax, WORD PTR Nmax   	/* load Nmax in AX */
			mov bx, WORD PTR Cmax   	/* load Cmax in BX */
			sub ax, WORD PTR Nmin   	/* AX = Nmax - Nmin */
			sub bx, WORD PTR Cmin   	/* BX = Cmax - Cmin */
			jz             L10311    	/* check division by zero */
			xor dx, dx   	/* prepare for division, zero DX */
			div               bx    	/* AX = AX/BX */
			jmp            L10312
L10311:
		mov ax, 255   	/* if div by zero, assume result max byte value */
L10312:                  	/* ** Duplicate AX in 4 words of MM0 ** */
		mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm0, eax   	/* copy EAX into MM0 */
			movd mm1, eax   	/* copy EAX into MM1 */
			punpckldq mm0, mm1   	/* fill higher words of MM0 with AX */
			/* ** Duplicate Cmin in 4 words of MM1 ** */
			mov ax, WORD PTR Cmin   	/* load Cmin into AX */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm1, eax   	/* copy EAX into MM1 */
			movd mm2, eax   	/* copy EAX into MM2 */
			punpckldq mm1, mm2   	/* fill higher words of MM1 with Cmin */
			/* ** Duplicate Nmin in 4 words of MM2 ** */
			mov ax, WORD PTR Nmin   	/* load Nmin into AX */
			mov bx, ax   	/* copy AX into BX */
			shl eax, 16   	/* shift 2 bytes of EAX left */
			mov ax, bx   	/* copy BX into AX */
			movd mm2, eax   	/* copy EAX into MM2 */
			movd mm3, eax   	/* copy EAX into MM3 */
			punpckldq mm2, mm3   	/* fill higher words of MM2 with Nmin */
			pxor mm7, mm7   	/* zero MM7 register */
			mov eax, Src1   	/* load Src1 address into eax */
			mov edi, Dest   	/* load Dest address into edi */
			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
			align 16                 	/* 16 byte alignment of the loop entry */
L1031:
		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
		movq mm4, mm3   	/* copy MM3 into MM4  */
			punpcklbw mm3, mm7   	/* unpack low  bytes of Src1 into words */
			punpckhbw mm4, mm7   	/* unpack high bytes of Src1 into words */
			psubusb mm3, mm1   	/* S-Cmin, low  bytes */
			psubusb mm4, mm1   	/* S-Cmin, high bytes */
			pmullw mm3, mm0   	/* MM0*(S-Cmin), low  bytes */
			pmullw mm4, mm0   	/* MM0*(S-Cmin), high bytes */
			paddusb mm3, mm2   	/* MM0*(S-Cmin)+Nmin, low  bytes */
			paddusb mm4, mm2   	/* MM0*(S-Cmin)+Nmin, high bytes */
			/* ** Take abs value of the signed words ** */
			movq mm5, mm3   	/* copy mm3 into mm5 */
			movq mm6, mm4   	/* copy mm4 into mm6 */
			psraw mm5, 15   	/* fill mm5 words with word sign bit */
			psraw mm6, 15   	/* fill mm6 words with word sign bit */
			pxor mm3, mm5   	/* take 1's complement of only neg words */
			pxor mm4, mm6   	/* take 1's complement of only neg words */
			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
			movq [edi], mm3   	/* store result in Dest */
			add eax, 8   	/* increase Src1 register pointer by 8 */
			add edi, 8   	/* increase Dest register pointer by 8 */
			dec              ecx    	/* decrease loop counter */
			jnz             L1031    	/* check loop termination, proceed if required */
			emms                      	/* exit MMX state */
			popa
	}
#else
	/* i386 and x86_64 */
	__m64 *mSrc1 = (__m64*)Src1;
	__m64 *mDest = (__m64*)Dest;
	__m64 mm0, mm1, mm2, mm3;

	int i;
	/* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
	unsigned short a = Nmax - Nmin;
	unsigned short b = Cmax - Cmin;
	if (b == 0) {
	    a = 255;				/* if div by zero, assume result max byte value */
	} else {
	    a /= b;
	}
	i = (a<<16)|a;
	mm0 = _m_from_int(i);
	mm1 = _m_from_int(i);
	mm0 = _m_punpckldq(mm0, mm1);			/* fill higher words of MM0 with the factor */
	/* Duplicate Cmin in 4 words of MM1 */
	i = (Cmin<<16)|(short)Cmin;
	mm1 = _m_from_int(i);
	mm2 = _m_from_int(i);
	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher words of MM1 with Cmin */
	/* Duplicate Nmin in 4 words of MM2 */
	i = (Nmin<<16)|(short)Nmin;
	mm2 = _m_from_int(i);
	mm3 = _m_from_int(i);
	mm2 = _m_punpckldq(mm2, mm3);			/* fill higher words of MM2 with Nmin */
	__m64 mm7 = _m_from_int(0);			/* zero mm7 register */
	for (i = 0; i < SrcLength/8; i++) {
		/* this mm3 shadows the function-level mm3 used as a temporary above */
		__m64 mm3, mm4, mm5, mm6;
		mm3 = _m_punpcklbw(*mSrc1, mm7);	/* unpack low  bytes of Src1 into words */
		mm4 = _m_punpckhbw(*mSrc1, mm7);	/* unpack high bytes of Src1 into words */
		mm3 = _m_psubusb(mm3, mm1);		/* S-Cmin, low	bytes */
		mm4 = _m_psubusb(mm4, mm1);		/* S-Cmin, high bytes */
		mm3 = _m_pmullw(mm3, mm0);		/* MM0*(S-Cmin), low  bytes */
		mm4 = _m_pmullw(mm4, mm0);		/* MM0*(S-Cmin), high bytes */
		mm3 = _m_paddusb(mm3, mm2);		/* MM0*(S-Cmin)+Nmin, low  bytes */
		mm4 = _m_paddusb(mm4, mm2);		/* MM0*(S-Cmin)+Nmin, high bytes */
		/* Take abs value of the signed words */
		mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
		mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
		mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
		mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
		mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
		mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
		*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
		mSrc1++;
		mDest++;
	}
	_m_empty();					/* clean MMX state */
#endif
	return (0);
#else
	/* MMX support not compiled in: nothing was processed */
	return (-1);
#endif
}

/*!
\brief Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)

The scale factor (Nmax - Nmin)/(Cmax - Cmin) is an integer division. In the
C routine, Cmax == Cmin leaves the bytes it would have processed unwritten.

\param Src Pointer to the start of the source byte array (S).
\param Dest Pointer to the start of the destination byte array (D).
\param length The number of bytes in the source array.
\param Cmin Normalization constant.
\param Cmax Normalization constant.
\param Nmin Normalization constant.
\param Nmax Normalization constant.

\return Returns 0 for success or -1 for error (NULL pointer).
*/
int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
								   int Nmax)
{
	unsigned int i, istart;
	int dC, factor;
	int result;

	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return(-1);
	if (length == 0)
		return(0);

	/* Let the MMX routine process all complete 8-byte groups. It returns -1
	   without touching Dest when MMX support is not compiled in; in that
	   case the C loop below has to process the whole array. */
	istart = 0;
	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
		if (SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax) == 0) {
			istart = length & 0xfffffff8;
		}
	}

	/* C routine to process the bytes not handled above */
	dC = Cmax - Cmin;
	if (dC == 0)
		return (0);	/* avoid division by zero; remaining bytes stay unwritten */
	factor = (Nmax - Nmin) / dC;
	for (i = istart; i < length; i++) {
		result = factor * ((int) Src[i] - Cmin) + Nmin;
		if (result > 255)
			result = 255;
		/* NOTE(review): a negative result is not clamped here and wraps when
		   converted to a byte, while the MMX routine takes the absolute value
		   before saturating - confirm which behaviour is wanted. */
		Dest[i] = (unsigned char) result;
	}

	return (0);
}

/* ------------------------------------------------------------------------------------ */

/*!
\brief Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... )

For every pixel that has a complete 3x3 neighbourhood, the neighbourhood is
multiplied element-wise with the kernel (16-bit products), the products are
summed with signed 16-bit saturation, the sum is divided by Divisor and the
quotient is stored in Dest clamped to 0..255. The first and last row and the
first and last column of Dest are not written.

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >2.
\param columns Number of columns in source/destination array. Must be >2.
\param Kernel The 2D convolution kernel of size 3x3. The MMX code loads it as three rows
of four 16-bit words each (12 words, 24 bytes): |K0 K1 K2 0|, |K3 K4 K5 0|, |K6 K7 K8 0|.
The fourth word of every row takes part in the sum and therefore has to be 0.
\param Divisor The divisor of the convolution sum. Must be >0.

Note: Non-MMX implementation not available for this function.

Note: The MMX code loads 8 bytes of Src per kernel row, so for the last pixels
of the image it reads up to 5 bytes past the end of a rows*columns buffer.

NOTE(review): the GCC asm statement declares Dest as a write-only "=m" operand
although it is read, and lists no clobbers (it relies on pusha/popa) - confirm
this is acceptable for the compilers in use.

\return Returns 0 if the filter was applied, or -1 on invalid parameters or when no
MMX implementation is available (MMX not detected, or MMX code not compiled in).
*/
int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
										   signed short *Kernel, unsigned char Divisor)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 3) || (rows < 3) || (Divisor == 0))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
				pxor mm0, mm0   	/* zero MM0 */
				xor ebx, ebx   	/* zero EBX */
				mov bl, Divisor   	/* load Divisor into BL */
				mov edx, Kernel   	/* load Kernel address into EDX */
				movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
				add edx, 8   	/* second row              |K0 K1 K2 0| */
				movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
				add edx, 8   	/* third row               |K6 K7 K8 0| */
				movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
				/* --- */
				mov eax, columns   	/* load columns into EAX */
				mov esi, Src   	/* ESI = Src row 0 address */
				mov edi, Dest   	/* load Dest address to EDI */
				add edi, eax   	/* EDI = EDI + columns */
				inc edi    	/* 1 byte offset from the left edge */
				mov edx, rows   	/* initialize ROWS counter */
				sub edx, 2   	/* do not use first and last row */
				/* --- */
L10320:
				mov ecx, eax   	/* initialize COLUMNS counter */
				sub ecx, 2   	/* do not use first and last column */
				align 16                 	/* 16 byte alignment of the loop entry */
L10322:
				/* --- */
				movq mm1, [esi]   	/* load 8 bytes of the image first row */
				add esi, eax   	/* move one row below */
				movq mm2, [esi]   	/* load 8 bytes of the image second row */
				add esi, eax   	/* move one row below */
				movq mm3, [esi]   	/* load 8 bytes of the image third row */
				punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
				punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
				punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
				pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
				pmullw mm2, mm6   	/* multiply words second row image*Kernel */
				pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
				paddsw mm1, mm2   	/* add 4 words of the first and second rows */
				paddsw mm1, mm3   	/* add 4 words of the third row and result */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				psrlq mm1, 32   	/* shift 2 left words to the right */
				paddsw mm1, mm2   	/* add 2 left and 2 right result words */
				movq mm3, mm1   	/* copy MM1 into MM3 */
				psrlq mm1, 16   	/* shift 1 left word to the right */
				paddsw mm1, mm3   	/* add 1 left and 1 right result words */
				/* -- */
				movd mm2, eax   	/* save EAX in MM2 */
				movd mm3, edx   	/* save EDX in MM3 */
				movd eax, mm1   	/* copy MM1 into EAX */
				psraw mm1, 15   	/* spread sign bit of the result */
				movd edx, mm1   	/* fill EDX with a sign bit */
				idiv bx    	/* IDIV - VERY EXPENSIVE */
				movd mm1, eax   	/* move result of division into MM1 */
				packuswb mm1, mm0   	/* pack division result with saturation */
				movd eax, mm1   	/* copy saturated result into EAX */
				mov [edi], al   	/* copy a byte result into Dest */
				movd edx, mm3   	/* restore saved EDX */
				movd eax, mm2   	/* restore saved EAX */
				/* -- */
				sub esi, eax   	/* move two rows up */
				sub esi, eax   	/* */
				inc esi    	/* move Src  pointer to the next pixel */
				inc edi    	/* move Dest pointer to the next pixel */
				/* --- */
				dec ecx    	/* decrease loop counter COLUMNS */
				jnz L10322    	/* check loop termination, proceed if required */
				add esi, 2   	/* move to the next row in Src */
				add edi, 2   	/* move to the next row in Dest */
				dec edx    	/* decrease loop counter ROWS */
				jnz L10320    	/* check loop termination, proceed if required */
				/* --- */
				emms                      	/* exit MMX state */
				popa
		}
#else
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
			/* --- */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
			"sub          $2, %%edx \n\t"	/* do not use first and last row */
			/* --- */
			".L10320:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10322:               \n\t"
			/* --- */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
			"add       %%eax, %%esi \n\t"	/* move one row below */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
			"add       %%eax, %%esi \n\t"	/* move one row below */
			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words */
			/* -- */
			"movd      %%eax, %%mm2 \n\t"	/* save EAX in MM2 */
			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
			"movd      %%mm1, %%eax \n\t"	/* copy MM1 into EAX */
			"psraw       $15, %%mm1 \n\t"	/* spread sign bit of the result */
			"movd      %%mm1, %%edx \n\t"	/* fill EDX with a sign bit */
			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
			"movd      %%eax, %%mm1 \n\t"	/* move result of division into MM1 */
			"packuswb  %%mm0, %%mm1 \n\t"	/* pack division result with saturation */
			"movd      %%mm1, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
			"movd      %%mm2, %%eax \n\t"	/* restore saved EAX */
			/* -- */
			"sub       %%eax, %%esi \n\t"	/* move two rows up */
			"sub       %%eax, %%esi \n\t"	/* */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10322 \n\t"	/* check loop termination, proceed if required */
			"add          $2, %%esi \n\t"	/* move to the next row in Src */
			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10320 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(Divisor)		/* %5 */
			);
#endif
		return (0);
#else
		/* MMX was detected at run time, but the MMX code is not compiled in
		   (it needs both USE_MMX and i386). Nothing was written to Dest,
		   so do not report success. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using ConvolveKernel5x5Divide: Dij = saturation0and255( ... )

For every pixel that has a complete 5x5 neighbourhood, the neighbourhood is
multiplied element-wise with the kernel (16-bit products), the products are
summed with signed 16-bit saturation, the sum is divided by Divisor and the
quotient is stored in Dest clamped to 0..255. The first two and last two rows
and columns of Dest are not written.

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >4.
\param columns Number of columns in source/destination array. Must be >4.
\param Kernel The 2D convolution kernel of size 5x5. The MMX code loads it as five rows
of eight 16-bit words each (40 words, 80 bytes). All eight words of a row take part in
the sum, so the last three words of every row have to be 0.
\param Divisor The divisor of the convolution sum. Must be >0.

Note: Non-MMX implementation not available for this function.

Note: The MMX code loads 8 bytes of Src per kernel row, so for the last pixels
of the image it reads up to 3 bytes past the end of a rows*columns buffer.

NOTE(review): the GCC asm statement declares Dest as a write-only "=m" operand
although it is read, and lists no clobbers (it relies on pusha/popa) - confirm
this is acceptable for the compilers in use.

\return Returns 0 if the filter was applied, or -1 on invalid parameters or when no
MMX implementation is available (MMX not detected, or MMX code not compiled in).
*/
int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
										   signed short *Kernel, unsigned char Divisor)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 5) || (rows < 5) || (Divisor == 0))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
				pxor mm0, mm0   	/* zero MM0 */
				xor ebx, ebx   	/* zero EBX */
				mov bl, Divisor   	/* load Divisor into BL */
				movd mm5, ebx   	/* copy Divisor into MM5 */
				mov edx, Kernel   	/* load Kernel address into EDX */
				mov esi, Src   	/* load Src  address to ESI */
				mov edi, Dest   	/* load Dest address to EDI */
				add edi, 2   	/* 2 column offset from the left edge */
				mov eax, columns   	/* load columns into EAX */
				shl eax, 1   	/* EAX = columns * 2 */
				add edi, eax   	/* 2 row offset from the top edge */
				shr eax, 1   	/* EAX = columns */
				mov ebx, rows   	/* initialize ROWS counter */
				sub ebx, 4   	/* do not use first 2 and last 2 rows */
				/* --- */
L10330:
				mov ecx, eax   	/* initialize COLUMNS counter */
				sub ecx, 4   	/* do not use first 2 and last 2 columns */
				align 16                 	/* 16 byte alignment of the loop entry */
L10332:
				pxor mm7, mm7   	/* zero MM7 (accumulator) */
				movd mm6, esi   	/* save ESI in MM6 */
				/* --- 1 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 2 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 3 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 4 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 5 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- */
				movq mm3, mm7   	/* copy MM7 into MM3 */
				psrlq mm7, 32   	/* shift 2 left words to the right */
				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
				movq mm2, mm7   	/* copy MM7 into MM2 */
				psrlq mm7, 16   	/* shift 1 left word to the right */
				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
				/* --- */
				movd mm1, eax   	/* save EAX in MM1 */
				movd mm2, ebx   	/* save EBX in MM2 */
				movd mm3, edx   	/* save EDX in MM3 */
				movd eax, mm7   	/* load summation result into EAX */
				psraw mm7, 15   	/* spread sign bit of the result */
				movd ebx, mm5   	/* load Divisor into EBX */
				movd edx, mm7   	/* fill EDX with a sign bit */
				idiv bx    	/* IDIV - VERY EXPENSIVE */
				movd mm7, eax   	/* move result of division into MM7 */
				packuswb mm7, mm0   	/* pack division result with saturation */
				movd eax, mm7   	/* copy saturated result into EAX */
				mov [edi], al   	/* copy a byte result into Dest */
				movd edx, mm3   	/* restore saved EDX */
				movd ebx, mm2   	/* restore saved EBX */
				movd eax, mm1   	/* restore saved EAX */
				/* -- */
				movd esi, mm6   	/* move Src pointer to the top pixel */
				sub edx, 72   	/* EDX = Kernel address */
				inc esi    	/* move Src  pointer to the next pixel */
				inc edi    	/* move Dest pointer to the next pixel */
				/* --- */
				dec ecx    	/* decrease loop counter COLUMNS */
				jnz L10332    	/* check loop termination, proceed if required */
				add esi, 4   	/* move to the next row in Src */
				add edi, 4   	/* move to the next row in Dest */
				dec ebx    	/* decrease loop counter ROWS */
				jnz L10330    	/* check loop termination, proceed if required */
				/* --- */
				emms                      	/* exit MMX state */
				popa
		}
#else
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
			"shr          $1, %%eax \n\t"	/* EAX = columns */
			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
			/* --- */
			".L10330:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10332:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
			/* --- 1 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 2 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 3 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 4 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 5 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			/* --- */
			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
			"movd      %%ebx, %%mm2 \n\t"	/* save EBX in MM2 */
			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			/* -- */
			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
			"sub         $72, %%edx \n\t"	/* EDX = Kernel address */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10332 \n\t"	/* check loop termination, proceed if required */
			"add          $4, %%esi \n\t"	/* move to the next row in Src */
			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10330 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(Divisor)		/* %5 */
			);
#endif
		return (0);
#else
		/* MMX was detected at run time, but the MMX code is not compiled in
		   (it needs both USE_MMX and i386). Nothing was written to Dest,
		   so do not report success. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using ConvolveKernel7x7Divide: Dij = saturation0and255( ... )

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >6.
\param columns Number of columns in source/destination array. Must be >6.
\param Kernel The 2D convolution kernel of size 7x7.
\param Divisor The divisor of the convolution sum. Must be >0.

Note: Non-MMX implementation not available for this function.

\return Returns 1 if filter was applied, 0 otherwise.
*/
int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
										   signed short *Kernel, unsigned char Divisor)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 7) || (rows < 7) || (Divisor == 0))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
//#ifdef USE_MMX
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
				pxor mm0, mm0   	/* zero MM0 */
				xor ebx, ebx   	/* zero EBX */
				mov bl, Divisor   	/* load Divisor into BL */
				movd mm5, ebx   	/* copy Divisor into MM5 */
				mov edx, Kernel  	/* load Kernel address into EDX */
				mov esi, Src   	/* load Src  address to ESI */
				mov edi, Dest   	/* load Dest address to EDI */
				add edi, 3   	/* 3 column offset from the left edge */
				mov eax, columns   	/* load columns into EAX */
				add edi, eax   	/* 3 row offset from the top edge */
				add edi, eax
				add edi, eax
				mov ebx, rows   	/* initialize ROWS counter */
				sub ebx, 6   	/* do not use first 3 and last 3 rows */
				/* ---, */
L10340:
			mov ecx, eax   	/* initialize COLUMNS counter */
				sub ecx, 6   	/* do not use first 3 and last 3 columns */
				align 16                 	/* 16 byte alignment of the loop entry */
L10342:
			pxor mm7, mm7   	/* zero MM7 (accumulator) */
				movd mm6, esi   	/* save ESI in MM6 */
				/* --- 1 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 2 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 3 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 4 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 5 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 6 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* ---, */
				movq mm3, mm7   	/* copy MM7 into MM3 */
				psrlq mm7, 32   	/* shift 2 left words to the right */
				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
				movq mm2, mm7   	/* copy MM7 into MM2 */
				psrlq mm7, 16   	/* shift 1 left word to the right */
				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
				/* ---, */
				movd mm1, eax   	/* save EDX in MM1 */
				movd mm2, ebx   	/* save EDX in MM2 */
				movd mm3, edx   	/* save EDX in MM3 */
				movd eax, mm7   	/* load summation result into EAX */
				psraw mm7, 15   	/* spread sign bit of the result */
				movd ebx, mm5   	/* load Divisor into EBX */
				movd edx, mm7   	/* fill EDX with a sign bit */
				idiv bx    	/* IDIV - VERY EXPENSIVE */
				movd mm7, eax   	/* move result of division into MM7 */
				packuswb mm7, mm0   	/* pack division result with saturation */
				movd eax, mm7   	/* copy saturated result into EAX */
				mov [edi], al   	/* copy a byte result into Dest */
				movd edx, mm3   	/* restore saved EDX */
				movd ebx, mm2   	/* restore saved EBX */
				movd eax, mm1   	/* restore saved EAX */
				/* --, */
				movd esi, mm6   	/* move Src pointer to the top pixel */
				sub edx, 104   	/* EDX = Kernel address */
				inc              esi    	/* move Src  pointer to the next pixel */
				inc              edi    	/* move Dest pointer to the next pixel */
				/* ---, */
				dec              ecx    	/* decrease loop counter COLUMNS */
				jnz            L10342    	/* check loop termination, proceed if required */
				add esi, 6   	/* move to the next row in Src */
				add edi, 6   	/* move to the next row in Dest */
				dec              ebx    	/* decrease loop counter ROWS */
				jnz            L10340    	/* check loop termination, proceed if required */
				/* ---, */
				emms                      	/* exit MMX state */
				popa
		}
#else
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
			"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
			/* --- */
			".L10340:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10342:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
			/* --- 1 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 2 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 3 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 4 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 5 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 6 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			/* --- */
			"movd      %%eax, %%mm1 \n\t"	/* save EDX in MM1 */
			"movd      %%ebx, %%mm2 \n\t"	/* save EDX in MM2 */
			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			/* -- */
			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
			"sub        $104, %%edx \n\t"	/* EDX = Kernel address */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10342 \n\t"	/* check loop termination, proceed if required */
			"add          $6, %%esi \n\t"	/* move to the next row in Src */
			"add          $6, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10340 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(Divisor)		/* %5 */
			);
#endif
#endif
		return (0);
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using ConvolveKernel9x9Divide: Dij = saturation0and255( ... )

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >8.
\param columns Number of columns in source/destination array. Must be >8.
\param Kernel The 2D convolution kernel of size 9x9.
\param Divisor The divisor of the convolution sum. Must be >0.

Note: Non-MMX implementation not available for this function.

\return Returns 0 for success or -1 for error (e.g. NULL pointer, rows or columns < 9, or Divisor of 0).
*/
int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
										   signed short *Kernel, unsigned char Divisor)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 9) || (rows < 9) || (Divisor == 0))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
//#ifdef USE_MMX
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
				pxor mm0, mm0   	/* zero MM0 */
				xor ebx, ebx   	/* zero EBX */
				mov bl, Divisor   	/* load Divisor into BL */
				movd mm5, ebx   	/* copy Divisor into MM5 */
				mov edx, Kernel   	/* load Kernel address into EDX */
				mov esi, Src   	/* load Src  address to ESI */
				mov edi, Dest   	/* load Dest address to EDI */
				add edi, 4   	/* 4 column offset from the left edge */
				mov eax, columns   	/* load columns into EAX */
				add edi, eax   	/* 4 row offset from the top edge */
				add edi, eax
				add edi, eax
				add edi, eax
				mov ebx, rows   	/* initialize ROWS counter */
				sub ebx, 8   	/* do not use first 4 and last 4 rows */
				/* ---, */
L10350:
			mov ecx, eax   	/* initialize COLUMNS counter */
				sub ecx, 8   	/* do not use first 4 and last 4 columns */
				align 16                 	/* 16 byte alignment of the loop entry */
L10352:
			pxor mm7, mm7   	/* zero MM7 (accumulator) */
				movd mm6, esi   	/* save ESI in MM6 */
				/* --- 1 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 2 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 3 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 4 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 5 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 6 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 8 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 9 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* ---, */
				movq mm3, mm7   	/* copy MM7 into MM3 */
				psrlq mm7, 32   	/* shift 2 left words to the right */
				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
				movq mm2, mm7   	/* copy MM7 into MM2 */
				psrlq mm7, 16   	/* shift 1 left word to the right */
				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
				/* ---, */
				movd mm1, eax   	/* save EDX in MM1 */
				movd mm2, ebx   	/* save EDX in MM2 */
				movd mm3, edx   	/* save EDX in MM3 */
				movd eax, mm7   	/* load summation result into EAX */
				psraw mm7, 15   	/* spread sign bit of the result */
				movd ebx, mm5   	/* load Divisor into EBX */
				movd edx, mm7   	/* fill EDX with a sign bit */
				idiv bx    	/* IDIV - VERY EXPENSIVE */
				movd mm7, eax   	/* move result of division into MM7 */
				packuswb mm7, mm0   	/* pack division result with saturation */
				movd eax, mm7   	/* copy saturated result into EAX */
				mov [edi], al   	/* copy a byte result into Dest */
				movd edx, mm3   	/* restore saved EDX */
				movd ebx, mm2   	/* restore saved EBX */
				movd eax, mm1   	/* restore saved EAX */
				/* --, */
				movd esi, mm6   	/* move Src pointer to the top pixel */
				sub edx, 208   	/* EDX = Kernel address */
				inc              esi    	/* move Src  pointer to the next pixel */
				inc              edi    	/* move Dest pointer to the next pixel */
				/* ---, */
				dec              ecx    	/* decrease loop counter COLUMNS */
				jnz            L10352    	/* check loop termination, proceed if required */
				add esi, 8   	/* move to the next row in Src */
				add edi, 8   	/* move to the next row in Dest */
				dec              ebx    	/* decrease loop counter ROWS */
				jnz            L10350    	/* check loop termination, proceed if required */
				/* ---, */
				emms                      	/* exit MMX state */
				popa
		}
#else
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge */
			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
			"sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
			/* --- */
			".L10350:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10352:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
			/* --- 1 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 2 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 3 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 4 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 5 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 6 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 8 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 9 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			/* --- */
			"movd      %%eax, %%mm1 \n\t"	/* save EDX in MM1 */
			"movd      %%ebx, %%mm2 \n\t"	/* save EDX in MM2 */
			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			/* -- */
			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
			"sub        $208, %%edx \n\t"	/* EDX = Kernel address */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10352 \n\t"	/* check loop termination, proceed if required */
			"add          $8, %%esi \n\t"	/* move to the next row in Src */
			"add          $8, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10350 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(Divisor)		/* %5 */
			);
#endif
#endif
		return (0);
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... )

Only the interior of the image is written: the first/last row and the
first/last column of Dest are left untouched.

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >2.
\param columns Number of columns in source/destination array. Must be >2.
\param Kernel The 2D convolution kernel of size 3x3. The assembly loads every kernel row as
four 16-bit words (8 bytes per row, 12 words in total): |K0 K1 K2 0|, |K3 K4 K5 0|, |K6 K7 K8 0|.
The fourth word of each row is multiplied with a fourth pixel and added to the sum, so it
should be 0 for a true 3x3 kernel.
\param NRightShift The number of right bit shifts. Values greater than 7 are rejected.
The assembly applies the shift to each unpacked source pixel before it is multiplied with the kernel.

Note: Non-MMX implementation not available for this function.

Note: every pixel is fetched with an 8 byte load of which only the low 4 bytes are used, so
the loads for the last processed pixels extend past rows*columns bytes of Src.
NOTE(review): callers presumably have to provide a few bytes of readable padding after Src -- confirm.

\return Returns 0 if the filter was applied, -1 if a parameter is invalid or if no MMX
implementation is available (MMX not detected at runtime, or the MMX assembly was not compiled in).
*/
int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
											   signed short *Kernel, unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 3) || (rows < 3) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			xor ebx, ebx   	/* zero EBX */
			mov bl, NRightShift   	/* load NRightShift into BL */
			movd mm4, ebx   	/* copy NRightShift into MM4 */
			mov edx, Kernel   	/* load Kernel address into EDX */
			movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
			add edx, 8   	/* second row              |K0 K1 K2 0| */
			movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
			add edx, 8   	/* third row               |K6 K7 K8 0| */
			movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
			/* ---, */
			mov eax, columns   	/* load columns into EAX */
			mov esi, Src   	/* ESI = Src row 0 address */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, eax   	/* EDI = EDI + columns */
			inc edi    	/* 1 byte offset from the left edge */
			mov edx, rows   	/* initialize ROWS counter */
			sub edx, 2   	/* do not use first and last row */
			/* ---, */
L10360:
			mov ecx, eax   	/* initialize COLUMNS counter */
			sub ecx, 2   	/* do not use first and last column */
			align 16                 	/* 16 byte alignment of the loop entry */
L10362:
			/* ---, */
			movq mm1, [esi]   	/* load 8 bytes of the image first row */
			add esi, eax   	/* move one row below */
			movq mm2, [esi]   	/* load 8 bytes of the image second row */
			add esi, eax   	/* move one row below */
			movq mm3, [esi]   	/* load 8 bytes of the image third row */
			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
			punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
			punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
			psrlw mm1, mm4   	/* shift right each pixel NshiftRight times */
			psrlw mm2, mm4   	/* shift right each pixel NshiftRight times */
			psrlw mm3, mm4   	/* shift right each pixel NshiftRight times */
			pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
			pmullw mm2, mm6   	/* multiply words second row image*Kernel */
			pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
			paddsw mm1, mm2   	/* add 4 words of the first and second rows */
			paddsw mm1, mm3   	/* add 4 words of the third row and result */
			movq mm2, mm1   	/* copy MM1 into MM2 */
			psrlq mm1, 32   	/* shift 2 left words to the right */
			paddsw mm1, mm2   	/* add 2 left and 2 right result words */
			movq mm3, mm1   	/* copy MM1 into MM3 */
			psrlq mm1, 16   	/* shift 1 left word to the right */
			paddsw mm1, mm3   	/* add 1 left and 1 right result words */
			packuswb mm1, mm0   	/* pack shift result with saturation */
			movd ebx, mm1   	/* copy saturated result into EBX */
			mov [edi], bl   	/* copy a byte result into Dest */
			/* --, */
			sub esi, eax   	/* move two rows up */
			sub esi, eax
			inc esi    	/* move Src  pointer to the next pixel */
			inc edi    	/* move Dest pointer to the next pixel */
			/* ---, */
			dec ecx    	/* decrease loop counter COLUMNS */
			jnz L10362    	/* check loop termination, proceed if required */
			add esi, 2   	/* move to the next row in Src */
			add edi, 2   	/* move to the next row in Dest */
			dec edx    	/* decrease loop counter ROWS */
			jnz L10360    	/* check loop termination, proceed if required */
			/* ---, */
			emms                      	/* exit MMX state */
			popa
		}
#else
		asm volatile
			("pusha		     \n\t"
			"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
			"movd      %%ebx, %%mm4 \n\t"	/* copy NRightShift into MM4 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
			/* --- */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
			"sub          $2, %%edx \n\t"	/* do not use first and last row */
			/* --- */
			".L10360:               \n\t"
			"mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10362:               \n\t"
			/* --- */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
			"add       %%eax, %%esi \n\t"	/* move one row below */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
			"add       %%eax, %%esi \n\t"	/* move one row below */
			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
			"psrlw     %%mm4, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm4, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm4, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words */
			"packuswb  %%mm0, %%mm1 \n\t"	/* pack shift result with saturation */
			"movd      %%mm1, %%ebx \n\t"	/* copy saturated result into EBX */
			"mov      %%bl, (%%edi) \n\t"	/* copy a byte result into Dest */
			/* -- */
			"sub       %%eax, %%esi \n\t"	/* move two rows up */
			"sub       %%eax, %%esi \n\t"
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10362 \n\t"	/* check loop termination, proceed if required */
			"add          $2, %%esi \n\t"	/* move to the next row in Src */
			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10360 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(NRightShift)	/* %5 */
			);
#endif
		/* The MMX routine above was compiled in and has processed the image */
		return (0);
#else
		/* MMX was detected at runtime, but the MMX assembly is not part of this
		   build (USE_MMX undefined or not an i386 target). Dest was not written,
		   so report failure instead of claiming success. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( ... )

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >4.
\param columns Number of columns in source/destination array. Must be >4.
\param Kernel The 2D convolution kernel of size 5x5.
\param NRightShift The number of right bit shifts to apply. Values greater than 7 are rejected.

Note: Non-MMX implementation not available for this function.

\return Returns 1 if filter was applied, 0 otherwise.
*/
int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
											   signed short *Kernel, unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 5) || (rows < 5) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
//#ifdef USE_MMX
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
				pxor mm0, mm0   	/* zero MM0 */
				xor ebx, ebx   	/* zero EBX */
				mov bl, NRightShift   	/* load NRightShift into BL */
				movd mm5, ebx   	/* copy NRightShift into MM5 */
				mov edx, Kernel   	/* load Kernel address into EDX */
				mov esi, Src   	/* load Src  address to ESI */
				mov edi, Dest   	/* load Dest address to EDI */
				add edi, 2   	/* 2 column offset from the left edge */
				mov eax, columns   	/* load columns into EAX */
				shl eax, 1   	/* EAX = columns * 2 */
				add edi, eax   	/* 2 row offset from the top edge */
				shr eax, 1   	/* EAX = columns */
				mov ebx, rows   	/* initialize ROWS counter */
				sub ebx, 4   	/* do not use first 2 and last 2 rows */
				/* ---, */
L10370:
			mov ecx, eax   	/* initialize COLUMNS counter */
				sub ecx, 4   	/* do not use first 2 and last 2 columns */
				align 16                 	/* 16 byte alignment of the loop entry */
L10372:
			pxor mm7, mm7   	/* zero MM7 (accumulator) */
				movd mm6, esi   	/* save ESI in MM6 */
				/* --- 1 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 2 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 3 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 4 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 5 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* ---, */
				movq mm3, mm7   	/* copy MM7 into MM3 */
				psrlq mm7, 32   	/* shift 2 left words to the right */
				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
				movq mm2, mm7   	/* copy MM7 into MM2 */
				psrlq mm7, 16   	/* shift 1 left word to the right */
				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
				movd mm1, eax   	/* save EAX in MM1 */
				packuswb mm7, mm0   	/* pack division result with saturation */
				movd eax, mm7   	/* copy saturated result into EAX */
				mov [edi], al   	/* copy a byte result into Dest */
				movd eax, mm1   	/* restore saved EAX */
				/* --, */
				movd esi, mm6   	/* move Src pointer to the top pixel */
				sub edx, 72   	/* EDX = Kernel address */
				inc              esi    	/* move Src  pointer to the next pixel */
				inc              edi    	/* move Dest pointer to the next pixel */
				/* ---, */
				dec              ecx    	/* decrease loop counter COLUMNS */
				jnz            L10372    	/* check loop termination, proceed if required */
				add esi, 4   	/* move to the next row in Src */
				add edi, 4   	/* move to the next row in Dest */
				dec              ebx    	/* decrease loop counter ROWS */
				jnz            L10370    	/* check loop termination, proceed if required */
				/* ---, */
				emms                      	/* exit MMX state */
				popa
		}
#else
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
			"shr          $1, %%eax \n\t"	/* EAX = columns */
			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
			/* --- */
			".L10370:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10372:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
			/* --- 1 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 2 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 3 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 4 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 5 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			/* -- */
			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
			"sub         $72, %%edx \n\t"	/* EDX = Kernel address */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10372 \n\t"	/* check loop termination, proceed if required */
			"add          $4, %%esi \n\t"	/* move to the next row in Src */
			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10370 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(NRightShift)	/* %5 */
			);
#endif
#endif
		return (0);
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using ConvolveKernel7x7ShiftRight: Dij = saturation0and255( ... )

For every pixel that is at least 3 pixels away from each image edge, the source
bytes of the surrounding window are each shifted right by NRightShift, multiplied
with the matching kernel words (low 16 bits of the product are kept), summed up
with signed 16-bit saturation and stored in Dest clamped to 0..255.
The 3 pixel wide border of Dest is never written.

\param Src The source 2D byte array to convolve. Should be different from destination.
Note: the MMX code loads 8 bytes per window row, i.e. it reads one byte past the end
of the row for the right-most output pixel (and therefore one byte past rows*columns
for the last output row).
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >6.
\param columns Number of columns in source/destination array. Must be >6.
\param Kernel The 2D convolution kernel of size 7x7.
Note: the MMX code reads 8 signed words per kernel row (row stride of 16 bytes, 7 rows,
56 words in total) and multiplies the 8th word with the 8th loaded source byte, so the
8th word of every row has to be present and should be 0 for a plain 7x7 convolution.
\param NRightShift The number of right bit shifts applied to each source pixel before it is
multiplied with the kernel. Must be <=7.

Note: Non-MMX implementation not available for this function.

\return Returns 0 for success or -1 for error (invalid parameters, MMX not available,
or MMX code not compiled in).
*/
int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
											   signed short *Kernel, unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 7) || (rows < 7) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
		/* The inline assembly below is 32bit x86 only (uses PUSHA/POPA and 32bit registers) */
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
				pusha
				pxor mm0, mm0   	/* zero MM0 */
				xor ebx, ebx   	/* zero EBX */
				mov bl, NRightShift   	/* load NRightShift into BL */
				movd mm5, ebx   	/* copy NRightShift into MM5 */
				mov edx, Kernel   	/* load Kernel address into EDX */
				mov esi, Src   	/* load Src  address to ESI */
				mov edi, Dest   	/* load Dest address to EDI */
				add edi, 3   	/* 3 column offset from the left edge */
				mov eax, columns   	/* load columns into EAX */
				add edi, eax   	/* 3 row offset from the top edge */
				add edi, eax
				add edi, eax
				mov ebx, rows   	/* initialize ROWS counter */
				sub ebx, 6   	/* do not use first 3 and last 3 rows */
				/* ---, */
L10380:
				mov ecx, eax   	/* initialize COLUMNS counter */
				sub ecx, 6   	/* do not use first 3 and last 3 columns */
				align 16                 	/* 16 byte alignment of the loop entry */
L10382:
				pxor mm7, mm7   	/* zero MM7 (accumulator) */
				movd mm6, esi   	/* save ESI in MM6 */
				/* --- 1 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 2 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 3 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 4 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 5 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 6 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 7 (last row: Src and Kernel pointers are not advanced past it) */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
				movq mm2, mm1   	/* copy MM1 into MM2 */
				movq mm3, [edx]   	/* load 4 words of Kernel */
				add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- horizontal sum of the 4 accumulator words */
				movq mm3, mm7   	/* copy MM7 into MM3 */
				psrlq mm7, 32   	/* shift 2 left words to the right */
				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
				movq mm2, mm7   	/* copy MM7 into MM2 */
				psrlq mm7, 16   	/* shift 1 left word to the right */
				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
				movd mm1, eax   	/* save EAX in MM1 */
				packuswb mm7, mm0   	/* pack division result with saturation */
				movd eax, mm7   	/* copy saturated result into EAX */
				mov [edi], al   	/* copy a byte result into Dest */
				movd eax, mm1   	/* restore saved EAX */
				/* --, */
				movd esi, mm6   	/* move Src pointer to the top pixel */
				sub edx, 104   	/* EDX = Kernel address (6 rows * 16 bytes + 8 bytes were added) */
				inc              esi    	/* move Src  pointer to the next pixel */
				inc              edi    	/* move Dest pointer to the next pixel */
				/* ---, */
				dec              ecx    	/* decrease loop counter COLUMNS */
				jnz            L10382    	/* check loop termination, proceed if required */
				add esi, 6   	/* move to the next row in Src */
				add edi, 6   	/* move to the next row in Dest */
				dec              ebx    	/* decrease loop counter ROWS */
				jnz            L10380    	/* check loop termination, proceed if required */
				/* ---, */
				emms                      	/* exit MMX state */
				popa
		}
#else
		/*
		 * Dest is only read by the assembly (the pointer value is loaded into EDI),
		 * so it is passed as an input operand. The "memory" clobber tells the compiler
		 * that the code stores through Dest and loads through Src and Kernel.
		 */
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
			"add       %%eax, %%edi \n\t"
			"add       %%eax, %%edi \n\t"
			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
			"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
			/* --- */
			".L10380:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10382:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
			/* --- 1 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 2 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 3 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 4 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 5 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 6 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 7 (last row: Src and Kernel pointers are not advanced past it) */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- horizontal sum of the 4 accumulator words */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			/* -- */
			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
			"sub        $104, %%edx \n\t"	/* EDX = Kernel address (6 rows * 16 bytes + 8 bytes were added) */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10382 \n\t"	/* check loop termination, proceed if required */
			"add          $6, %%esi \n\t"	/* move to the next row in Src */
			"add          $6, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10380 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t"
			:			/* no output operands */
			:"m"(Dest),		/* %0 */
			"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(NRightShift)	/* %5 */
			:"memory"		/* stores through Dest, loads through Src and Kernel */
			);
#endif
		return (0);
#else
		/* MMX was detected at runtime, but no MMX code is compiled in: Dest was not touched */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using ConvolveKernel9x9ShiftRight: Dij = saturation255( ... )

\param Src The source 2D byte array to convolve. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >8.
\param columns Number of columns in source/destination array. Must be >8.
\param Kernel The 2D convolution kernel of size 9x9.
\param NRightShift The number of right bit shifts to apply to the convolution sum. Must be <=7.

Note: Non-MMX implementation not available for this function.

\return Returns 1 if filter was applied, 0 otherwise.
*/
int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
											   signed short *Kernel, unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
		return(-1);

	if ((columns < 9) || (rows < 9) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
//#ifdef USE_MMX
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
				pxor mm0, mm0   	/* zero MM0 */
				xor ebx, ebx   	/* zero EBX */
				mov bl, NRightShift   	/* load NRightShift into BL */
				movd mm5, ebx   	/* copy NRightShift into MM5 */
				mov edx, Kernel   	/* load Kernel address into EDX */
				mov esi, Src   	/* load Src  address to ESI */
				mov edi, Dest   	/* load Dest address to EDI */
				add edi, 4   	/* 4 column offset from the left edge */
				mov eax, columns   	/* load columns into EAX */
				add edi, eax   	/* 4 row offset from the top edge */
				add edi, eax
				add edi, eax
				add edi, eax
				mov ebx, rows   	/* initialize ROWS counter */
				sub ebx, 8   	/* do not use first 4 and last 4 rows */
				/* ---, */
L10390:
			mov ecx, eax   	/* initialize COLUMNS counter */
				sub ecx, 8   	/* do not use first 4 and last 4 columns */
				align 16                 	/* 16 byte alignment of the loop entry */
L10392:
			pxor mm7, mm7   	/* zero MM7 (accumulator) */
				movd mm6, esi   	/* save ESI in MM6 */
				/* --- 1 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 2 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 3 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 4 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 5 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 6 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 8 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			dec              esi
				add esi, eax   	/* move Src pointer 1 row below */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* --- 9 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm2, mm1   	/* copy MM1 into MM2 */
				inc              esi    	/* move pointer to the next 8 bytes of Src */
				movq mm3, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				movq mm4, [edx]   	/* load 4 words of Kernel */
			add edx, 8   	/* move pointer to other 4 words */
				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				movq mm1, [esi]   	/* load 8 bytes of the Src */
			movq mm3, [edx]   	/* load 4 words of Kernel */
			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
				/* ---, */
				movq mm3, mm7   	/* copy MM7 into MM3 */
				psrlq mm7, 32   	/* shift 2 left words to the right */
				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
				movq mm2, mm7   	/* copy MM7 into MM2 */
				psrlq mm7, 16   	/* shift 1 left word to the right */
				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
				movd mm1, eax   	/* save EAX in MM1 */
				packuswb mm7, mm0   	/* pack division result with saturation */
				movd eax, mm7   	/* copy saturated result into EAX */
				mov [edi], al   	/* copy a byte result into Dest */
				movd eax, mm1   	/* restore saved EAX */
				/* --, */
				movd esi, mm6   	/* move Src pointer to the top pixel */
				sub edx, 208   	/* EDX = Kernel address */
				inc              esi    	/* move Src  pointer to the next pixel */
				inc              edi    	/* move Dest pointer to the next pixel */
				/* ---, */
				dec              ecx    	/* decrease loop counter COLUMNS */
				jnz            L10392    	/* check loop termination, proceed if required */
				add esi, 8   	/* move to the next row in Src */
				add edi, 8   	/* move to the next row in Dest */
				dec              ebx    	/* decrease loop counter ROWS */
				jnz            L10390    	/* check loop termination, proceed if required */
				/* ---, */
				emms                      	/* exit MMX state */
				popa
		}
#else
		asm volatile
			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge */
			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
			"sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
			/* --- */
			".L10390:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10392:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
			/* --- 1 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 2 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 3 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 4 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 5 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 6 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 8 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- 9 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
			/* --- */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
			/* -- */
			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
			"sub        $208, %%edx \n\t"	/* EDX = Kernel address */
			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10392 \n\t"	/* check loop termination, proceed if required */
			"add          $8, %%esi \n\t"	/* move to the next row in Src */
			"add          $8, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10390 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t":"=m" (Dest)	/* %0 */
			:"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(Kernel),		/* %4 */
			"m"(NRightShift)	/* %5 */
			);
#endif
#endif
		return (0);
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/* ------------------------------------------------------------------------------------ */

/*!
\brief Filter using SobelX: Dij = saturation255( ... )

For every group of three vertically adjacent source rows the MMX code builds 16 bit
column sums (top + 2*middle + bottom). Each result byte is the absolute difference of
two column sums that lie two pixels apart, saturated to 255. Eight result bytes are
produced per inner-loop iteration (columns/8 iterations per row) and stored in Dest
starting one row down and one byte in from the top-left corner.

\param Src The source 2D byte array to sobel-filter. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >2.
\param columns Number of columns in source/destination array. Must be >7.

Note: Non-MMX implementation not available for this function.

NOTE(review): every inner-loop iteration loads 8 bytes at offsets 0 and 2 of each of the
three rows, so when (columns % 8) < 2 the last iteration on the last row appears to read
up to 2 bytes past rows*columns - confirm that callers provide padded buffers.

\return Returns 0 if the filter was applied, -1 otherwise (invalid parameters, MMX not
detected at run time, or the i386 MMX code was not compiled in).
*/
int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);

	if ((columns < 8) || (rows < 3))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			mov eax, columns   	/* load columns into EAX */
			/* --- */
			mov esi, Src   	/* ESI = Src row 0 address */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, eax   	/* EDI = EDI + columns */
			inc edi   	/* 1 byte offset from the left edge */
			mov edx, rows   	/* initialize ROWS counter */
			sub edx, 2   	/* do not use first and last rows */
			/* --- */
L10400:
			mov ecx, eax   	/* initialize COLUMNS counter */
			shr ecx, 3   	/* ECX/8 (MMX loads 8 bytes at a time) */
			mov ebx, esi   	/* save ESI in EBX */
			movd mm1, edi   	/* save EDI in MM1 */
			align 16   	/* 16 byte alignment of the loop entry */
L10402:
			/* --- top row --- */
			movq mm4, [esi]   	/* load 8 bytes from Src */
			movq mm5, mm4   	/* save MM4 in MM5 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm4, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
			movq mm6, [esi]   	/* load 8 bytes from Src */
			movq mm7, mm6   	/* save MM6 in MM7 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm6, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
			/* --- middle row, added twice (weight 2) --- */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm4, mm2   	/* add 4 low  bytes to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high bytes to accumulator MM5 */
			paddw mm4, mm2   	/* add 4 low  bytes to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high bytes to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm6, mm2   	/* add 4 low  bytes to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high bytes to accumulator MM7 */
			paddw mm6, mm2   	/* add 4 low  bytes to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high bytes to accumulator MM7 */
			/* --- bottom row --- */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm4, mm2   	/* add 4 low  bytes to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high bytes to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			paddw mm6, mm2   	/* add 4 low  bytes to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high bytes to accumulator MM7 */
			/* --- differences of column sums two pixels apart --- */
			movq mm2, mm4   	/* copy MM4 into MM2 */
			psrlq mm4, 32   	/* shift 2 left words to the right */
			psubw mm4, mm2   	/* MM4 = MM4 - MM2 */
			movq mm3, mm6   	/* copy MM6 into MM3 */
			psrlq mm6, 32   	/* shift 2 left words to the right */
			psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
			punpckldq mm4, mm6   	/* combine 2 words of MM6 and 2 words of MM4 */
			movq mm2, mm5   	/* copy MM5 into MM2 */
			psrlq mm5, 32   	/* shift 2 left words to the right */
			psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
			movq mm3, mm7   	/* copy MM7 into MM3 */
			psrlq mm7, 32   	/* shift 2 left words to the right */
			psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
			punpckldq mm5, mm7   	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			movq mm6, mm4   	/* copy MM4 into MM6 */
			movq mm7, mm5   	/* copy MM5 into MM7 */
			psraw mm6, 15   	/* fill MM6 words with word sign bit */
			psraw mm7, 15   	/* fill MM7 words with word sign bit */
			pxor mm4, mm6   	/* take 1's complement of only neg. words */
			pxor mm5, mm7   	/* take 1's complement of only neg. words */
			psubsw mm4, mm6   	/* add 1 to only neg. words, W-(-1) or W-0 */
			psubsw mm5, mm7   	/* add 1 to only neg. words, W-(-1) or W-0 */
			packuswb mm4, mm5   	/* combine and pack/saturate MM5 and MM4 */
			movq [edi], mm4   	/* store result in Dest */
			/* --- */
			sub esi, eax   	/* move to the current top row in Src */
			sub esi, eax
			add esi, 8   	/* move Src  pointer to the next 8 pixels */
			add edi, 8   	/* move Dest pointer to the next 8 pixels */
			/* --- */
			dec ecx   	/* decrease loop counter COLUMNS */
			jnz L10402   	/* check loop termination, proceed if required */
			mov esi, ebx   	/* restore most left current row Src  address */
			movd edi, mm1   	/* restore most left current row Dest address */
			add esi, eax   	/* move to the next row in Src */
			add edi, eax   	/* move to the next row in Dest */
			dec edx   	/* decrease loop counter ROWS */
			jnz L10400   	/* check loop termination, proceed if required */
			/* --- */
			emms   	/* exit MMX state */
			popa
		}
#else
		asm volatile
			("pusha                  \n\t"
			"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			/* --- */
			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
			"sub          $2, %%edx \n\t"	/* do not use first and last rows */
			/* --- */
			".L10400:               \n\t"
			"mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"shr          $3, %%ecx \n\t"	/* ECX/8 (MMX loads 8 bytes at a time) */
			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
			"movd      %%edi, %%mm1 \n\t"	/* save EDI in MM1 */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10402:               \n\t"
			/* --- top row --- */
			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
			/* --- middle row, added twice (weight 2) --- */
			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumulator MM5 */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumulator MM5 */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumulator MM7 */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumulator MM7 */
			/* --- bottom row --- */
			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumulator MM5 */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumulator MM7 */
			/* --- differences of column sums two pixels apart --- */
			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
			"psrlq       $32, %%mm4 \n\t"	/* shift 2 left words to the right */
			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
			"psrlq       $32, %%mm6 \n\t"	/* shift 2 left words to the right */
			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
			"punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
			"movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
			"psrlq       $32, %%mm5 \n\t"	/* shift 2 left words to the right */
			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
			"punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
			/* --- */
			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
			"sub       %%eax, %%esi \n\t"
			"add          $8, %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
			"add          $8, %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10402 \n\t"	/* check loop termination, proceed if required */
			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
			"movd      %%mm1, %%edi \n\t"	/* restore most left current row Dest address */
			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10400 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t"
			:			/* no outputs: the Dest pointer itself is only read */
			:"m"(Dest),		/* %0 */
			"m"(Src),		/* %1 */
			"m"(rows),		/* %2 */
			"m"(columns)		/* %3 */
			:"memory"		/* reads the Src buffer and writes the Dest buffer */
			);
#endif
		return (0);
#else
		/* MMX was detected at run time but the i386 MMX code is not part of this
		   build: nothing has been written to Dest, so do not report success. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Filter using SobelXShiftRight: Dij = saturation255( ... )

Same computation as SDL_imageFilterSobelX(), except that every source pixel is shifted
right by NRightShift bits after it has been unpacked to 16 bit and before it is added to
the column sums (top + 2*middle + bottom). Each result byte is the absolute difference of
two column sums that lie two pixels apart, saturated to 255. Eight result bytes are
produced per inner-loop iteration (columns/8 iterations per row) and stored in Dest
starting one row down and one byte in from the top-left corner.

\param Src The source 2D byte array to sobel-filter. Should be different from destination.
\param Dest The destination 2D byte array to store the result in. Should be different from source.
\param rows Number of rows in source/destination array. Must be >2.
\param columns Number of columns in source/destination array. Must be >7.
\param NRightShift The number of right bit shifts to apply to each source pixel. Must be <8.

Note: Non-MMX implementation not available for this function.

NOTE(review): every inner-loop iteration loads 8 bytes at offsets 0 and 2 of each of the
three rows, so when (columns % 8) < 2 the last iteration on the last row appears to read
up to 2 bytes past rows*columns - confirm that callers provide padded buffers.

\return Returns 0 if the filter was applied, -1 otherwise (invalid parameters, MMX not
detected at run time, or the i386 MMX code was not compiled in).
*/
int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
									unsigned char NRightShift)
{
	/* Validate input parameters */
	if ((Src == NULL) || (Dest == NULL))
		return (-1);
	if ((columns < 8) || (rows < 3) || (NRightShift > 7))
		return (-1);

	if ((SDL_imageFilterMMXdetect())) {
#if defined(USE_MMX) && defined(i386)
#if !defined(GCC__)
		__asm
		{
			pusha
			pxor mm0, mm0   	/* zero MM0 */
			mov eax, columns   	/* load columns into EAX */
			xor ebx, ebx   	/* zero EBX */
			mov bl, NRightShift   	/* load NRightShift into BL */
			movd mm1, ebx   	/* copy NRightShift into MM1 */
			/* --- */
			mov esi, Src   	/* ESI = Src row 0 address */
			mov edi, Dest   	/* load Dest address to EDI */
			add edi, eax   	/* EDI = EDI + columns */
			inc edi   	/* 1 byte offset from the left edge */
			/* initialize ROWS counter (kept in the rows variable itself) */
			sub rows, 2   	/* do not use first and last rows */
			/* --- */
L10410:
			mov ecx, eax   	/* initialize COLUMNS counter */
			shr ecx, 3   	/* ECX/8 (MMX loads 8 bytes at a time) */
			mov ebx, esi   	/* save ESI in EBX */
			mov edx, edi   	/* save EDI in EDX */
			align 16   	/* 16 byte alignment of the loop entry */
L10412:
			/* --- top row --- */
			movq mm4, [esi]   	/* load 8 bytes from Src */
			movq mm5, mm4   	/* save MM4 in MM5 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm4, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
			psrlw mm4, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm5, mm1   	/* shift right each pixel NRightShift times */
			movq mm6, [esi]   	/* load 8 bytes from Src */
			movq mm7, mm6   	/* save MM6 in MM7 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm6, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
			psrlw mm6, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm7, mm1   	/* shift right each pixel NRightShift times */
			/* --- middle row, added twice (weight 2) --- */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
			paddw mm4, mm2   	/* add 4 low  bytes to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high bytes to accumulator MM5 */
			paddw mm4, mm2   	/* add 4 low  bytes to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high bytes to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
			paddw mm6, mm2   	/* add 4 low  bytes to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high bytes to accumulator MM7 */
			paddw mm6, mm2   	/* add 4 low  bytes to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high bytes to accumulator MM7 */
			/* --- bottom row --- */
			add esi, eax   	/* move to the next row of Src */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			add esi, 2   	/* move ESI pointer 2 bytes right */
			punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
			paddw mm4, mm2   	/* add 4 low  bytes to accumulator MM4 */
			paddw mm5, mm3   	/* add 4 high bytes to accumulator MM5 */
			movq mm2, [esi]   	/* load 8 bytes from Src */
			movq mm3, mm2   	/* save MM2 in MM3 */
			sub esi, 2   	/* move ESI pointer back 2 bytes left */
			punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
			punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
			psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
			psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
			paddw mm6, mm2   	/* add 4 low  bytes to accumulator MM6 */
			paddw mm7, mm3   	/* add 4 high bytes to accumulator MM7 */
			/* --- differences of column sums two pixels apart --- */
			movq mm2, mm4   	/* copy MM4 into MM2 */
			psrlq mm4, 32   	/* shift 2 left words to the right */
			psubw mm4, mm2   	/* MM4 = MM4 - MM2 */
			movq mm3, mm6   	/* copy MM6 into MM3 */
			psrlq mm6, 32   	/* shift 2 left words to the right */
			psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
			punpckldq mm4, mm6   	/* combine 2 words of MM6 and 2 words of MM4 */
			movq mm2, mm5   	/* copy MM5 into MM2 */
			psrlq mm5, 32   	/* shift 2 left words to the right */
			psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
			movq mm3, mm7   	/* copy MM7 into MM3 */
			psrlq mm7, 32   	/* shift 2 left words to the right */
			psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
			punpckldq mm5, mm7   	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			movq mm6, mm4   	/* copy MM4 into MM6 */
			movq mm7, mm5   	/* copy MM5 into MM7 */
			psraw mm6, 15   	/* fill MM6 words with word sign bit */
			psraw mm7, 15   	/* fill MM7 words with word sign bit */
			pxor mm4, mm6   	/* take 1's complement of only neg. words */
			pxor mm5, mm7   	/* take 1's complement of only neg. words */
			psubsw mm4, mm6   	/* add 1 to only neg. words, W-(-1) or W-0 */
			psubsw mm5, mm7   	/* add 1 to only neg. words, W-(-1) or W-0 */
			packuswb mm4, mm5   	/* combine and pack/saturate MM5 and MM4 */
			movq [edi], mm4   	/* store result in Dest */
			/* --- */
			sub esi, eax   	/* move to the current top row in Src */
			sub esi, eax
			add esi, 8   	/* move Src  pointer to the next 8 pixels */
			add edi, 8   	/* move Dest pointer to the next 8 pixels */
			/* --- */
			dec ecx   	/* decrease loop counter COLUMNS */
			jnz L10412   	/* check loop termination, proceed if required */
			mov esi, ebx   	/* restore most left current row Src  address */
			mov edi, edx   	/* restore most left current row Dest address */
			add esi, eax   	/* move to the next row in Src */
			add edi, eax   	/* move to the next row in Dest */
			dec rows   	/* decrease loop counter ROWS */
			jnz L10410   	/* check loop termination, proceed if required */
			/* --- */
			emms   	/* exit MMX state */
			popa
		}
#else
		asm volatile
			("pusha                  \n\t"
			"pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
			"mov          %3, %%eax \n\t"	/* load columns into EAX */
			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
			"mov           %4, %%bl \n\t"	/* load NRightShift into BL */
			"movd      %%ebx, %%mm1 \n\t"	/* copy NRightShift into MM1 */
			/* --- */
			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
			"mov          %2, %%edi \n\t"	/* load Dest address to EDI */
			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
			/* initialize ROWS counter (kept in the rows variable itself) */
			"subl            $2, %0 \n\t"	/* do not use first and last rows */
			/* --- */
			".L10410:               \n\t"
			"mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
			"shr          $3, %%ecx \n\t"	/* ECX/8 (MMX loads 8 bytes at a time) */
			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
			"mov       %%edi, %%edx \n\t"	/* save EDI in EDX */
			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
			".L10412:               \n\t"
			/* --- top row --- */
			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm4 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm5 \n\t"	/* shift right each pixel NRightShift times */
			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm6 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm7 \n\t"	/* shift right each pixel NRightShift times */
			/* --- middle row, added twice (weight 2) --- */
			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumulator MM5 */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumulator MM5 */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumulator MM7 */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumulator MM7 */
			/* --- bottom row --- */
			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumulator MM4 */
			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumulator MM5 */
			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumulator MM6 */
			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumulator MM7 */
			/* --- differences of column sums two pixels apart --- */
			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
			"psrlq       $32, %%mm4 \n\t"	/* shift 2 left words to the right */
			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
			"psrlq       $32, %%mm6 \n\t"	/* shift 2 left words to the right */
			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
			"punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
			"movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
			"psrlq       $32, %%mm5 \n\t"	/* shift 2 left words to the right */
			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
			"punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
			/* Take abs values of MM4 and MM5 */
			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
			"packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
			/* --- */
			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
			"sub       %%eax, %%esi \n\t"
			"add          $8, %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
			"add          $8, %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
			/* --- */
			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
			"jnz            .L10412 \n\t"	/* check loop termination, proceed if required */
			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
			"mov       %%edx, %%edi \n\t"	/* restore most left current row Dest address */
			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
			"decl                %0 \n\t"	/* decrease loop counter ROWS */
			"jnz            .L10410 \n\t"	/* check loop termination, proceed if required */
			/* --- */
			"emms                   \n\t"	/* exit MMX state */
			"popa                   \n\t"
			:"+m"(rows)		/* %0: used as the row counter and modified in place */
			:"m"(Src),		/* %1 */
			"m"(Dest),		/* %2 */
			"m"(columns),		/* %3 */
			"m"(NRightShift)	/* %4 */
			:"memory"		/* reads the Src buffer and writes the Dest buffer */
			);
#endif
		return (0);
#else
		/* MMX was detected at run time but the i386 MMX code is not part of this
		   build: nothing has been written to Dest, so do not report success. */
		return (-1);
#endif
	} else {
		/* No non-MMX implementation yet */
		return (-1);
	}
}

/*!
\brief Align stack to 32 byte boundary.

Computes (ESP - 4) rounded down to a multiple of 32, stores the previous ESP value at
that address and makes that address the new ESP. SDL_imageFilterRestoreStack() loads the
saved value back from [ESP].

The code operates on the 32 bit ESP/EBX registers, so it is only compiled for 32 bit x86
targets when USE_MMX is defined; in every other configuration this function does nothing.

NOTE(review): ESP is changed inside an ordinary C function and EBX is overwritten without
being saved or declared as clobbered - confirm this is safe with the compiler and calling
convention in use.
*/
void SDL_imageFilterAlignStack(void)
{
#if defined(USE_MMX) && (defined(i386) || defined(__i386__) || defined(_M_IX86))
#if !defined(GCC__)
	__asm
	{				/* --- stack alignment --- */
		mov ebx, esp   	/* load ESP into EBX */
		sub ebx, 4   	/* reserve space on stack for old value of ESP */
		and ebx, -32   	/* align EBX along a 32 byte boundary */
		mov [ebx], esp   	/* save old value of ESP in stack, behind the boundary */
		mov esp, ebx   	/* align ESP along a 32 byte boundary */
	}
#else
	asm volatile
		(				/* --- stack alignment --- */
		"mov       %%esp, %%ebx \n\t"	/* load ESP into EBX */
		"sub          $4, %%ebx \n\t"	/* reserve space on stack for old value of ESP */
		"and        $-32, %%ebx \n\t"	/* align EBX along a 32 byte boundary */
		"mov     %%esp, (%%ebx) \n\t"	/* save old value of ESP in stack, behind the boundary */
		"mov       %%ebx, %%esp \n\t"	/* align ESP along a 32 byte boundary */
		::);
#endif
#endif
}

/*!
\brief Restore previously aligned stack.

Loads the 32 bit value stored at [ESP] and makes it the new ESP. This is the value that
SDL_imageFilterAlignStack() stores at the aligned stack address.

The code operates on the 32 bit ESP/EBX registers, so it is only compiled for 32 bit x86
targets when USE_MMX is defined; in every other configuration this function does nothing.

NOTE(review): the value is read from whatever ESP points to when the asm executes, and
EBX is overwritten without being saved or declared as clobbered - confirm this is safe
with the compiler and calling convention in use.
*/
void SDL_imageFilterRestoreStack(void)
{
#if defined(USE_MMX) && (defined(i386) || defined(__i386__) || defined(_M_IX86))
#if !defined(GCC__)
	__asm
	{				/* --- restoring old stack --- */
		mov ebx, [esp]   	/* load old value of ESP */
		mov esp, ebx   	/* restore old value of ESP */
	}
#else
	asm volatile
		(				/* --- restoring old stack --- */
		"mov     (%%esp), %%ebx \n\t"	/* load old value of ESP */
		"mov       %%ebx, %%esp \n\t"	/* restore old value of ESP */
		::);
#endif
#endif
}