diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..4ed2421
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,103 @@
+# libSDL_prizm: build system
+
+cmake_minimum_required(VERSION 3.16)
+project(LibSDL_prizm VERSION 1.2.15 LANGUAGES C)
+
+# Libraries that libSDL_prizm depends on
+find_package(Gint 2.7.1 REQUIRED)
+
+# Turn include/SDL_config.h.in into a proper SDL_config.h where the @VAR@ (and
+# ${VAR}, since @ONLY is not passed) references have been replaced. For example,
+# LibSDL_prizm_VERSION is set to "1.2.15" by the project() command above.
+# Note that the input (SDL_config.h.in) is relative to the source dir, but the
+# output (SDL_config.h) is in the build dir, so it doesn't pollute the Git repo.
+configure_file(./include/SDL_config.h.in include/SDL/SDL_config.h)
+
+set(SOURCES
+  src/SDL.c
+  src/SDL_error.c
+  src/SDL_fatal.c
+  src/audio/SDL_audio.c
+  src/audio/SDL_audiocvt.c
+  src/cpuinfo/SDL_cpuinfo.c
+  src/events/SDL_active.c
+  src/events/SDL_events.c
+  src/events/SDL_expose.c
+  src/events/SDL_keyboard.c
+  src/events/SDL_mouse.c
+  src/events/SDL_quit.c
+  src/events/SDL_resize.c
+  src/file/SDL_rwops.c
+  src/joystick/SDL_joystick.c
+  src/stdlib/SDL_getenv.c
+  src/stdlib/SDL_iconv.c
+  src/stdlib/SDL_malloc.c
+  src/stdlib/SDL_qsort.c
+  src/stdlib/SDL_stdlib.c
+  src/stdlib/SDL_string.c
+  src/thread/SDL_thread.c
+  src/thread/generic/SDL_syscond.c
+  src/thread/generic/SDL_sysmutex.c
+  src/thread/generic/SDL_syssem.c
+  src/thread/generic/SDL_systhread.c
+  src/timer/SDL_timer.c
+  src/timer/prizm/SDL_systimer.c
+  src/video/SDL_blit.c
+  src/video/SDL_blit_0.c
+  src/video/SDL_blit_1.c
+  src/video/SDL_blit_A.c
+  src/video/SDL_blit_N.c
+  src/video/SDL_bmp.c
+  src/video/SDL_cursor.c
+  src/video/SDL_gamma.c
+  src/video/SDL_pixels.c
+  src/video/SDL_RLEaccel.c
+  src/video/SDL_stretch.c
+  src/video/SDL_surface.c
+  src/video/SDL_video.c
+  src/video/SDL_yuv.c
+  src/video/SDL_yuv_mmx.c
+  src/video/SDL_yuv_sw.c
+  src/video/prizm/SDL_prizmevents.c
+  src/video/prizm/SDL_prizmfonts.c
+  src/video/prizm/SDL_prizmnti.c
+  src/video/prizm/SDL_prizmutils.c
+  src/video/prizm/SDL_prizmvideo.c
+  src/gfx/SDL_framerate.c
+  src/gfx/SDL_gfxBlitFunc.c
+  src/gfx/SDL_gfxPrimitives.c
+  src/gfx/SDL_imageFilter.c
+  src/gfx/SDL_rotozoom.c
+)
+
+# Target name is "SDL_prizm", output file is "libSDL_prizm.a" (by default)
+add_library(SDL_prizm STATIC ${SOURCES})
+# Search paths and flags are scoped to the target; -lm (a linker input) is gone
+target_include_directories(SDL_prizm PRIVATE
+  "${PROJECT_SOURCE_DIR}/include"
+  "${PROJECT_SOURCE_DIR}/include/SDL"
+  "${PROJECT_BINARY_DIR}/include"
+  "${FXSDK_COMPILER_INSTALL}/include/SDL"
+  "${FXSDK_COMPILER_INSTALL}/include/openlibm")
+target_compile_options(SDL_prizm PRIVATE -Os -m4-nofpu -mb -std=c11 -ffreestanding -nostdlib -Wa,--dsp)
+
+# After building, install the target (that is, libSDL_prizm.a) in the compiler
+install(TARGETS SDL_prizm
+  DESTINATION "${FXSDK_COMPILER_INSTALL}")
+
+# Also install the headers: the include/SDL directory is copied into the
+# compiler's include folder (merging with an existing include/SDL, if any).
+# Only files matching *.h are installed; anything else in it is left out.
+install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include/SDL"
+  DESTINATION "${FXSDK_COMPILER_INSTALL}/include"
+  FILES_MATCHING PATTERN "*.h")
+# NOTE(review): a previous note said "*.h" had been replaced with "*"; the rule above still uses "*.h".
+
+# Install the generated SDL_config.h from the build dir
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/include/SDL/SDL_config.h"
+  DESTINATION "${FXSDK_COMPILER_INSTALL}/include/SDL")
+
+# Install FindSDL_prizm.cmake so that users can do find_package(SDL_prizm)
+install(FILES cmake/FindSDL_prizm.cmake
+  DESTINATION "${FXSDK_CMAKE_MODULE_PATH}")
+
diff --git a/Makefile.prizm b/Makefile.prizm
index fced1c9..21e1086 100644
--- a/Makefile.prizm
+++ b/Makefile.prizm
@@ -1,7 +1,7 @@
 # Makefile to build the SDL library
 
-INCLUDE = -I./include
-CFLAGS  = -O2 $(INCLUDE) -m4-nofpu -mb -ffreestanding -nostdlib -Wa,--dsp
+INCLUDE = -I./include -I$(shell sh-elf-gcc -print-file-name=include/openlibm)
+CFLAGS  = -O2 $(INCLUDE) -lm -m4-nofpu -mb -ffreestanding -nostdlib -Wa,--dsp
 AR	= sh-elf-gcc-ar
 RANLIB	= sh-elf-gcc-ranlib
 CC	= sh-elf-gcc
@@ -57,7 +57,13 @@ SOURCES = \
 	src/video/prizm/SDL_prizmfonts.c \
 	src/video/prizm/SDL_prizmnti.c \
 	src/video/prizm/SDL_prizmutils.c \
-	src/video/prizm/SDL_prizmvideo.c
+	src/video/prizm/SDL_prizmvideo.c \
+	src/gfx/SDL_framerate.c \
+	src/gfx/SDL_gfxBlitFunc.c \
+	src/gfx/SDL_gfxPrimitives.c \
+	src/gfx/SDL_imageFilter.c \
+	src/gfx/SDL_rotozoom.c \
+
 
 OBJECTS = $(SOURCES:.c=.o)
 
diff --git a/README.md b/README.md
index e69de29..9ce8734 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,12 @@
+# SDL 1.2.15 sources for Casio Graph 90+E
+
+A fully working GCC toolchain for the SH3/SH4 architecture is required.
+
+Compilation is done with the ad hoc Makefile:
+`make -f Makefile.prizm` in the main directory
+
+It should produce the library libSDL_prizm.a
+
+The following steps are not done automatically, so please carry them out manually:
+* copy the library libSDL_prizm.a into your SH3/SH4 compiler lib folder
+* copy all header files contained in the include folder into the include folder of the SH3/SH4 compiler
diff --git a/build b/build
new file mode 100755
index 0000000..b3ae7a6
--- /dev/null
+++ b/build
@@ -0,0 +1,4 @@
+make -f Makefile.prizm clean
+make -f Makefile.prizm || exit 1  # do not install a stale or missing library
+cp libSDL_prizm.a ~/.local/share/giteapc/Lephenixnoir/sh-elf-gcc/lib/gcc/sh3eb-elf/11.1.0/
+cp ./include/SDL/* ~/.local/share/giteapc/Lephenixnoir/sh-elf-gcc/lib/gcc/sh3eb-elf/11.1.0/include/SDL/  # headers live in include/SDL
diff --git a/cSDL1_2_15.cbp b/cSDL1_2_15.cbp
index 9c19679..5c4ffc1 100644
--- a/cSDL1_2_15.cbp
+++ b/cSDL1_2_15.cbp
@@ -71,42 +71,8 @@
 		<Unit filename="src/audio/SDL_audiocvt.c">
 			<Option compilerVar="CC" />
 		</Unit>
-		<Unit filename="src/audio/SDL_audiodev.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/audio/SDL_audiodev_c.h" />
 		<Unit filename="src/audio/SDL_audiomem.h" />
-		<Unit filename="src/audio/SDL_mixer.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/audio/SDL_mixer_MMX.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/audio/SDL_mixer_MMX.h" />
-		<Unit filename="src/audio/SDL_mixer_MMX_VC.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/audio/SDL_mixer_MMX_VC.h" />
-		<Unit filename="src/audio/SDL_mixer_m68k.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/audio/SDL_mixer_m68k.h" />
 		<Unit filename="src/audio/SDL_sysaudio.h" />
-		<Unit filename="src/audio/SDL_wave.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/audio/SDL_wave.h" />
-		<Unit filename="src/audio/dummy/SDL_dummyaudio.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/audio/dummy/SDL_dummyaudio.h" />
-		<Unit filename="src/cdrom/SDL_cdrom.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/cdrom/SDL_syscdrom.h" />
-		<Unit filename="src/cdrom/dummy/SDL_syscdrom.c">
-			<Option compilerVar="CC" />
-		</Unit>
 		<Unit filename="src/cpuinfo/SDL_cpuinfo.c">
 			<Option compilerVar="CC" />
 		</Unit>
@@ -141,15 +107,6 @@
 		</Unit>
 		<Unit filename="src/joystick/SDL_joystick_c.h" />
 		<Unit filename="src/joystick/SDL_sysjoystick.h" />
-		<Unit filename="src/joystick/dummy/SDL_sysjoystick.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/loadso/dummy/SDL_sysloadso.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/main/dummy/SDL_dummy_main.c">
-			<Option compilerVar="CC" />
-		</Unit>
 		<Unit filename="src/stdlib/SDL_getenv.c">
 			<Option compilerVar="CC" />
 		</Unit>
@@ -192,9 +149,6 @@
 			<Option compilerVar="CC" />
 		</Unit>
 		<Unit filename="src/timer/SDL_timer_c.h" />
-		<Unit filename="src/timer/dummy/SDL_systimer.c">
-			<Option compilerVar="CC" />
-		</Unit>
 		<Unit filename="src/timer/prizm/SDL_systimer.c">
 			<Option compilerVar="CC" />
 		</Unit>
@@ -258,18 +212,6 @@
 		<Unit filename="src/video/SDL_yuvfuncs.h" />
 		<Unit filename="src/video/blank_cursor.h" />
 		<Unit filename="src/video/default_cursor.h" />
-		<Unit filename="src/video/dummy/SDL_nullevents.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/video/dummy/SDL_nullevents_c.h" />
-		<Unit filename="src/video/dummy/SDL_nullmouse.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/video/dummy/SDL_nullmouse_c.h" />
-		<Unit filename="src/video/dummy/SDL_nullvideo.c">
-			<Option compilerVar="CC" />
-		</Unit>
-		<Unit filename="src/video/dummy/SDL_nullvideo.h" />
 		<Unit filename="src/video/e_log.h" />
 		<Unit filename="src/video/e_pow.h" />
 		<Unit filename="src/video/e_sqrt.h" />
diff --git a/cSDL1_2_15.layout b/cSDL1_2_15.layout
index 3d6d100..65d693e 100644
--- a/cSDL1_2_15.layout
+++ b/cSDL1_2_15.layout
@@ -2,9 +2,187 @@
 <CodeBlocks_layout_file>
 	<FileVersion major="1" minor="0" />
 	<ActiveTarget name="Release" />
-	<File name="src/video/prizm/SDL_prizmevents.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="src/video/prizm/SDL_prizmevents_c.h" open="0" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="1755" topLine="40" />
+			<Cursor1 position="1473" topLine="13" />
+		</Cursor>
+		<Folding>
+			<Collapse line="41" />
+		</Folding>
+	</File>
+	<File name="src/video/prizm/SDL_prizmnti.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="988" topLine="0" />
+		</Cursor>
+		<Folding>
+			<Collapse line="5" />
+		</Folding>
+	</File>
+	<File name="src/events/SDL_events_c.h" open="0" top="0" tabpos="8" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="0" topLine="55" />
+		</Cursor>
+	</File>
+	<File name="src/video/SDL_bmp.c" open="0" top="0" tabpos="4" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="2950" topLine="72" />
+		</Cursor>
+		<Folding>
+			<Collapse line="415" />
+		</Folding>
+	</File>
+	<File name="include/SDL_stdinc.h" open="0" top="0" tabpos="6" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="2433" topLine="65" />
+		</Cursor>
+	</File>
+	<File name="src/SDL.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="1394" topLine="78" />
+		</Cursor>
+	</File>
+	<File name="src/video/SDL_surface.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="14589" topLine="710" />
+		</Cursor>
+	</File>
+	<File name="src/video/SDL_pixels.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="8603" topLine="83" />
+		</Cursor>
+		<Folding>
+			<Collapse line="36" />
+			<Collapse line="38" />
+			<Collapse line="225" />
+			<Collapse line="238" />
+			<Collapse line="251" />
+			<Collapse line="266" />
+			<Collapse line="291" />
+			<Collapse line="313" />
+			<Collapse line="340" />
+			<Collapse line="342" />
+			<Collapse line="355" />
+			<Collapse line="357" />
+			<Collapse line="370" />
+			<Collapse line="404" />
+			<Collapse line="424" />
+			<Collapse line="436" />
+			<Collapse line="464" />
+			<Collapse line="489" />
+			<Collapse line="506" />
+			<Collapse line="530" />
+			<Collapse line="531" />
+			<Collapse line="542" />
+			<Collapse line="617" />
+		</Folding>
+	</File>
+	<File name="src/timer/SDL_systimer.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="1134" topLine="3" />
+		</Cursor>
+	</File>
+	<File name="src/events/SDL_keyboard.c" open="1" top="1" tabpos="3" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="14509" topLine="194" />
+		</Cursor>
+		<Folding>
+			<Collapse line="58" />
+			<Collapse line="337" />
+			<Collapse line="342" />
+			<Collapse line="357" />
+			<Collapse line="368" />
+			<Collapse line="374" />
+			<Collapse line="378" />
+			<Collapse line="383" />
+			<Collapse line="573" />
+			<Collapse line="596" />
+			<Collapse line="609" />
+		</Folding>
+	</File>
+	<File name="src/video/prizm/SDL_prizmvideo.c" open="0" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="6352" topLine="195" />
+		</Cursor>
+	</File>
+	<File name="include/SDL_endian.h" open="0" top="0" tabpos="5" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="3019" topLine="87" />
+		</Cursor>
+	</File>
+	<File name="src/file/SDL_rwops.c" open="0" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="17253" topLine="108" />
+		</Cursor>
+		<Folding>
+			<Collapse line="48" />
+			<Collapse line="186" />
+			<Collapse line="222" />
+			<Collapse line="269" />
+			<Collapse line="301" />
+			<Collapse line="323" />
+			<Collapse line="332" />
+			<Collapse line="342" />
+			<Collapse line="352" />
+			<Collapse line="367" />
+			<Collapse line="394" />
+			<Collapse line="414" />
+			<Collapse line="423" />
+			<Collapse line="428" />
+			<Collapse line="439" />
+			<Collapse line="444" />
+			<Collapse line="483" />
+			<Collapse line="532" />
+			<Collapse line="549" />
+			<Collapse line="566" />
+			<Collapse line="583" />
+			<Collapse line="594" />
+			<Collapse line="608" />
+			<Collapse line="615" />
+			<Collapse line="622" />
+			<Collapse line="629" />
+			<Collapse line="636" />
+			<Collapse line="644" />
+			<Collapse line="649" />
+			<Collapse line="654" />
+			<Collapse line="659" />
+			<Collapse line="664" />
+			<Collapse line="669" />
+		</Folding>
+	</File>
+	<File name="src/video/SDL_video.c" open="0" top="0" tabpos="3" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="29549" topLine="860" />
+		</Cursor>
+		<Folding>
+			<Collapse line="156" />
+			<Collapse line="1047" />
+			<Collapse line="1051" />
+			<Collapse line="1113" />
+		</Folding>
+	</File>
+	<File name="include/SDL_timer.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="0" topLine="21" />
+		</Cursor>
+	</File>
+	<File name="include/SDL_keyboard.h" open="1" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="3548" topLine="100" />
+		</Cursor>
+	</File>
+	<File name="Makefile.prizm" open="0" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="1235" topLine="26" />
+		</Cursor>
+	</File>
+	<File name="src/video/prizm/SDL_prizmfonts.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="113" topLine="2" />
+		</Cursor>
+	</File>
+	<File name="src/SDL_error.c" open="0" top="0" tabpos="3" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="1517" topLine="48" />
 		</Cursor>
 	</File>
 	<File name="src/video/prizm/SDL_prizmvideo.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
@@ -12,57 +190,39 @@
 			<Cursor1 position="0" topLine="11" />
 		</Cursor>
 	</File>
-	<File name="src/video/prizm/SDL_prizmfonts.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="src/video/prizm/SDL_prizmevents.c" open="0" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="159" topLine="0" />
+			<Cursor1 position="6421" topLine="150" />
 		</Cursor>
 	</File>
-	<File name="src/video/prizm/SDL_prizmutils.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="src/stdlib/SDL_string.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="307" topLine="0" />
+			<Cursor1 position="7025" topLine="273" />
 		</Cursor>
 	</File>
-	<File name="Makefile.prizm" open="0" top="0" tabpos="3" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="src/events/SDL_sysevents.h" open="0" top="0" tabpos="9" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="1727" topLine="47" />
+			<Cursor1 position="0" topLine="0" />
 		</Cursor>
 	</File>
-	<File name="src/video/prizm/SDL_prizmvideo.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="src/events/SDL_events.c" open="0" top="0" tabpos="7" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="0" topLine="72" />
+			<Cursor1 position="2769" topLine="6" />
 		</Cursor>
 	</File>
-	<File name="include/SDL_config_prizm.h" open="0" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="include/SDL_video.h" open="0" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="2746" topLine="12" />
+			<Cursor1 position="8865" topLine="271" />
 		</Cursor>
 	</File>
-	<File name="src/video/SDL_sysvideo.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="include/SDL_keysym.h" open="1" top="0" tabpos="4" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="12039" topLine="392" />
+			<Cursor1 position="9028" topLine="337" />
 		</Cursor>
 	</File>
-	<File name="include/SDL_config.h" open="0" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="include/SDL_config_prizm.h" open="0" top="0" tabpos="4" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="1297" topLine="15" />
-		</Cursor>
-	</File>
-	<File name="src/video/prizm/SDL_prizmnti.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
-		<Cursor>
-			<Cursor1 position="762" topLine="3" />
-		</Cursor>
-		<Folding>
-			<Collapse line="5" />
-		</Folding>
-	</File>
-	<File name="src/video/prizm/SDL_prizmfonts.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
-		<Cursor>
-			<Cursor1 position="567" topLine="0" />
-		</Cursor>
-	</File>
-	<File name="src/video/prizm/SDL_prizmevents_c.h" open="0" top="0" tabpos="4" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
-		<Cursor>
-			<Cursor1 position="2002" topLine="69" />
+			<Cursor1 position="2856" topLine="91" />
 		</Cursor>
 	</File>
 	<File name="src/video/SDL_blit.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
@@ -70,9 +230,70 @@
 			<Cursor1 position="1552" topLine="468" />
 		</Cursor>
 	</File>
-	<File name="include/SDL_video.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+	<File name="src/video/SDL_blit_N.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
 		<Cursor>
-			<Cursor1 position="8312" topLine="258" />
+			<Cursor1 position="0" topLine="153" />
+		</Cursor>
+	</File>
+	<File name="include/SDL_error.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="1459" topLine="47" />
+		</Cursor>
+	</File>
+	<File name="include/SDL_config.h" open="0" top="0" tabpos="2" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="1297" topLine="15" />
+		</Cursor>
+	</File>
+	<File name="src/video/prizm/SDL_prizmfonts.h" open="0" top="0" tabpos="5" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="86166" topLine="0" />
+		</Cursor>
+		<Folding>
+			<Collapse line="8" />
+			<Collapse line="268" />
+			<Collapse line="528" />
+			<Collapse line="788" />
+			<Collapse line="1048" />
+		</Folding>
+	</File>
+	<File name="src/timer/prizm/SDL_systimer.c" open="1" top="0" tabpos="1" split="0" active="1" splitpos="0" zoom_1="4" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="1839" topLine="63" />
+		</Cursor>
+	</File>
+	<File name="src/video/SDL_sysvideo.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="12039" topLine="392" />
+		</Cursor>
+	</File>
+	<File name="src/video/SDL_blit_0.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="0" topLine="39" />
+		</Cursor>
+		<Folding>
+			<Collapse line="29" />
+			<Collapse line="82" />
+			<Collapse line="117" />
+			<Collapse line="154" />
+			<Collapse line="190" />
+			<Collapse line="242" />
+			<Collapse line="276" />
+			<Collapse line="309" />
+			<Collapse line="343" />
+			<Collapse line="388" />
+			<Collapse line="438" />
+			<Collapse line="443" />
+		</Folding>
+	</File>
+	<File name="src/video/prizm/SDL_prizmutils.c" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="307" topLine="0" />
+		</Cursor>
+	</File>
+	<File name="src/timer/SDL_timer_c.h" open="0" top="0" tabpos="0" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
+		<Cursor>
+			<Cursor1 position="0" topLine="9" />
 		</Cursor>
 	</File>
 </CodeBlocks_layout_file>
diff --git a/cmake/FindSDL_prizm.cmake b/cmake/FindSDL_prizm.cmake
new file mode 100644
index 0000000..c9d6bde
--- /dev/null
+++ b/cmake/FindSDL_prizm.cmake
@@ -0,0 +1,49 @@
+find_package(Gint 2.7 REQUIRED)
+# Find libSDL_prizm.a; if we had platform-specific versions we could look for
+# libSDL_prizm-${FXSDK_PLATFORM}.a instead.
+execute_process(
+  COMMAND ${CMAKE_C_COMPILER} -print-file-name=libSDL_prizm.a
+  OUTPUT_VARIABLE EX_PATH OUTPUT_STRIP_TRAILING_WHITESPACE)
+# EX_PATH is now the full path if libSDL_prizm.a exists, "libSDL_prizm.a" otherwise
+if(NOT "${EX_PATH}" STREQUAL "libSDL_prizm.a")
+  # Find the SDL_config.h header, which the library installs in include/SDL
+  execute_process(
+    COMMAND ${CMAKE_C_COMPILER} -print-file-name=include/SDL/SDL_config.h
+    OUTPUT_VARIABLE EX_CONFIG OUTPUT_STRIP_TRAILING_WHITESPACE)
+  # Extract version information from SDL_config.h. This command prints the
+  # version on the line matching the regex and deletes every other line.
+  execute_process(
+    COMMAND sed -E "s/#define.*EX_VERSION\\s+\"(\\S+)\"$/\\1/p; d" ${EX_CONFIG}
+    OUTPUT_VARIABLE EX_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+endif()
+# NOTE(review): the regex above expects a line of the form
+#   #define <...>EX_VERSION "x.y.z"
+# in SDL_config.h; confirm that include/SDL_config.h.in really defines such a
+# macro, otherwise EX_VERSION stays empty and the package is never found.
+#
+# The commands above seem common for gint libraries, so the fxSDK provides a
+# helper function to get that directly:
+#   include(FindSimpleLibrary)
+#   find_simple_library(libSDL_prizm.a include/SDL/SDL_config.h "EX_VERSION"
+#     PATH_VAR EX_PATH VERSION_VAR EX_VERSION)
+# The name given below must match this module's file name (FindSDL_prizm.cmake).
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(SDL_prizm
+  REQUIRED_VARS EX_CONFIG EX_VERSION
+  VERSION_VAR EX_VERSION)
+# Keep the variable name that earlier revisions of this module exported.
+set(LibSDL_prizm_FOUND "${SDL_prizm_FOUND}")
+if(SDL_prizm_FOUND AND NOT TARGET LibSDL_prizm::LibSDL_prizm)
+  # This is an imported target, we don't build it, we just claim it's here
+  add_library(LibSDL_prizm::LibSDL_prizm UNKNOWN IMPORTED)
+  # Here we declare the compiler and linker flags that every user of the
+  # library needs to use.
+  set_target_properties(LibSDL_prizm::LibSDL_prizm PROPERTIES
+    # If we specify where the library comes from, CMake will watch that file
+    # and relink any user application when the library is updated!
+    IMPORTED_LOCATION "${EX_PATH}"
+    # Linking options
+    INTERFACE_LINK_OPTIONS -lSDL_prizm
+    # Dependencies (for order on the command-line)
+    IMPORTED_LINK_INTERFACE_LIBRARIES Gint::Gint)
+endif()
diff --git a/giteapc.make b/giteapc.make
new file mode 100644
index 0000000..453f68d
--- /dev/null
+++ b/giteapc.make
@@ -0,0 +1,26 @@
+# giteapc: version=1 depends=Lephenixnoir/gint,Lephenixnoir/sh-elf-gcc,Lephenixnoir/fxsdk,Lephenixnoir/OpenLibm,Vhex-Kernel-Core/fxlibc
+-include giteapc-config.make
+
+
+configure:
+	@ fxsdk build-fx -c
+	@ fxsdk build-cg -c
+
+build:
+	@ fxsdk build-fx
+	@ fxsdk build-cg
+
+install:
+	@ fxsdk build-fx install
+	@ fxsdk build-cg install
+
+# Delete every file listed in build-fx/install_manifest.txt and
+# build-cg/install_manifest.txt; a missing manifest is silently skipped.
+uninstall:
+	@ [ ! -e build-fx/install_manifest.txt ] || \
+	  xargs rm -f < build-fx/install_manifest.txt
+	@ [ ! -e build-cg/install_manifest.txt ] || \
+	  xargs rm -f < build-cg/install_manifest.txt
+
+.PHONY: configure build install uninstall
+
diff --git a/include/SDL.h b/include/SDL/SDL.h
similarity index 100%
rename from include/SDL.h
rename to include/SDL/SDL.h
diff --git a/include/SDL_active.h b/include/SDL/SDL_active.h
similarity index 100%
rename from include/SDL_active.h
rename to include/SDL/SDL_active.h
diff --git a/include/SDL_audio.h b/include/SDL/SDL_audio.h
similarity index 100%
rename from include/SDL_audio.h
rename to include/SDL/SDL_audio.h
diff --git a/include/SDL_byteorder.h b/include/SDL/SDL_byteorder.h
similarity index 100%
rename from include/SDL_byteorder.h
rename to include/SDL/SDL_byteorder.h
diff --git a/include/SDL_cdrom.h b/include/SDL/SDL_cdrom.h
similarity index 100%
rename from include/SDL_cdrom.h
rename to include/SDL/SDL_cdrom.h
diff --git a/include/SDL_config_minimal.h b/include/SDL/SDL_config_minimal.h
similarity index 100%
rename from include/SDL_config_minimal.h
rename to include/SDL/SDL_config_minimal.h
diff --git a/include/SDL_config_prizm.h b/include/SDL/SDL_config_prizm.h
similarity index 92%
rename from include/SDL_config_prizm.h
rename to include/SDL/SDL_config_prizm.h
index b0ec1cf..58eb474 100644
--- a/include/SDL_config_prizm.h
+++ b/include/SDL/SDL_config_prizm.h
@@ -89,12 +89,12 @@
 #define SDL_THREADS_DISABLED	1
 
 
-#define SDL_AUDIO_DRIVER_DUMMY	1
-#define SDL_VIDEO_DRIVER_DUMMY	1
+//#define SDL_AUDIO_DRIVER_DUMMY	1
+//#define SDL_VIDEO_DRIVER_DUMMY	1
 #define SDL_CDROM_DUMMY 1
 #define SDL_JOYSTICK_DUMMY 1
 #define SDL_LOADSO_DUMMY 1
-#define SDL_TIMER_DUMMY 1
+//#define SDL_TIMER_DUMMY 1
 
 
 /* Enable various timer systems */
@@ -108,12 +108,12 @@
 
 /* Fonts; needs to match nsp_font_charmaps in SDL_tinspirefonts.c */
 enum {
-	NSDL_FONT_THIN = 0,
-	NSDL_FONT_SPACE,
-	NSDL_FONT_VGA,
-	NSDL_FONT_FANTASY,
-	NSDL_FONT_TINYTYPE,
-	NSP_NUMFONTS
+	cSDL_FONT_THIN = 0,
+	cSDL_FONT_SPACE,
+	cSDL_FONT_VGA,
+	cSDL_FONT_FANTASY,
+	cSDL_FONT_TINYTYPE,
+	PRZ_NUMFONTS
 };
 
 
diff --git a/include/SDL_copying.h b/include/SDL/SDL_copying.h
similarity index 100%
rename from include/SDL_copying.h
rename to include/SDL/SDL_copying.h
diff --git a/include/SDL_cpuinfo.h b/include/SDL/SDL_cpuinfo.h
similarity index 100%
rename from include/SDL_cpuinfo.h
rename to include/SDL/SDL_cpuinfo.h
diff --git a/include/SDL_endian.h b/include/SDL/SDL_endian.h
similarity index 100%
rename from include/SDL_endian.h
rename to include/SDL/SDL_endian.h
diff --git a/include/SDL_error.h b/include/SDL/SDL_error.h
similarity index 100%
rename from include/SDL_error.h
rename to include/SDL/SDL_error.h
diff --git a/include/SDL_events.h b/include/SDL/SDL_events.h
similarity index 100%
rename from include/SDL_events.h
rename to include/SDL/SDL_events.h
diff --git a/include/SDL/SDL_framerate.h b/include/SDL/SDL_framerate.h
new file mode 100644
index 0000000..521797a
--- /dev/null
+++ b/include/SDL/SDL_framerate.h
@@ -0,0 +1,100 @@
+/*
+
+SDL_framerate.h: framerate manager
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL_framerate_h
+#define _SDL_framerate_h
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* --- */
+
+#include "SDL.h"
+
+	/* --------- Definitions */
+
+	/*!
+	\brief Highest possible rate supported by framerate controller in Hz (1/s).
+	*/
+#define FPS_UPPER_LIMIT		200
+
+	/*!
+	\brief Lowest possible rate supported by framerate controller in Hz (1/s).
+	*/
+#define FPS_LOWER_LIMIT		1
+
+	/*!
+	\brief Default rate of framerate controller in Hz (1/s).
+	*/
+#define FPS_DEFAULT		30
+
+	/*! 
+	\brief Structure holding the state and timing information of the framerate controller. 
+	*/
+	typedef struct {
+		Uint32 framecount;
+		float rateticks;
+		Uint32 baseticks;
+		Uint32 lastticks;
+		Uint32 rate;
+	} FPSmanager;
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL_GFX_DLL_IMPORT)
+#    define SDL_FRAMERATE_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL_GFX_DLL_IMPORT
+#      define SDL_FRAMERATE_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL_FRAMERATE_SCOPE
+#  define SDL_FRAMERATE_SCOPE extern
+#endif
+
+	/* Functions return 0 or value for success and -1 for error */
+
+	SDL_FRAMERATE_SCOPE void SDL_initFramerate(FPSmanager * manager);
+	SDL_FRAMERATE_SCOPE int SDL_setFramerate(FPSmanager * manager, Uint32 rate);
+	SDL_FRAMERATE_SCOPE int SDL_getFramerate(FPSmanager * manager);
+	SDL_FRAMERATE_SCOPE int SDL_getFramecount(FPSmanager * manager);
+	SDL_FRAMERATE_SCOPE Uint32 SDL_framerateDelay(FPSmanager * manager);
+
+	/* --- */
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL_framerate_h */
diff --git a/include/SDL_getenv.h b/include/SDL/SDL_getenv.h
similarity index 100%
rename from include/SDL_getenv.h
rename to include/SDL/SDL_getenv.h
diff --git a/include/SDL/SDL_gfxBlitFunc.h b/include/SDL/SDL_gfxBlitFunc.h
new file mode 100644
index 0000000..6491aad
--- /dev/null
+++ b/include/SDL/SDL_gfxBlitFunc.h
@@ -0,0 +1,168 @@
+/* 
+
+SDL_gfxBlitFunc.h: custom blitters
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL_gfxBlitFunc_h
+#define _SDL_gfxBlitFunc_h
+
+/* Headers are included before the extern "C" block is opened: a C++ standard
+   library may declare overloads in them, which forced C linkage would reject. */
+#include <stdio.h>
+#include <stdlib.h>
+#include "SDL.h"
+#include "SDL_video.h"
+
+#ifdef __cplusplus
+extern "C" {	/* C linkage for the declarations below, even when using C++ */
+#endif
+
+
+	extern const unsigned int GFX_ALPHA_ADJUST_ARRAY[256];
+
+	/* ---- Function Prototypes */
+
+/* Linkage prefix for the prototypes below: a __declspec import/export marker
+   when the DLL macros are set under MSVC, plain extern in every other case. */
+#ifdef _MSC_VER
+#  if defined(LIBSDL_GFX_DLL_IMPORT)
+#    define SDL_GFXBLITFUNC_SCOPE __declspec(dllimport)
+#  elif defined(DLL_EXPORT)
+#    define SDL_GFXBLITFUNC_SCOPE __declspec(dllexport)
+#  endif
+#endif
+#if !defined(SDL_GFXBLITFUNC_SCOPE)
+#  define SDL_GFXBLITFUNC_SCOPE extern
+#endif
+
+
+	SDL_GFXBLITFUNC_SCOPE int SDL_gfxBlitRGBA(SDL_Surface * src, SDL_Rect * srcrect, SDL_Surface * dst, SDL_Rect * dstrect);
+
+	SDL_GFXBLITFUNC_SCOPE int SDL_gfxSetAlpha(SDL_Surface * src, Uint8 a);
+
+	SDL_GFXBLITFUNC_SCOPE int SDL_gfxMultiplyAlpha(SDL_Surface * src, Uint8 a);
+
+	/* -------- Macros */
+
+	/* Define SDL macros locally as a substitute for an #include "SDL_blit.h", */
+	/* which doesn't work since the include file doesn't get installed.       */
+
+	/*!
+	\brief The structure passed to the low level blit functions. NOTE(review): the s_ and d_ prefixed fields presumably describe the source and destination of a blit -- confirm against the blitter sources.
+	*/
+	typedef struct {
+		Uint8    *s_pixels;
+		int       s_width;
+		int       s_height;
+		int       s_skip;
+		Uint8    *d_pixels;
+		int       d_width;
+		int       d_height;
+		int       d_skip;
+		void     *aux_data;
+		SDL_PixelFormat *src;
+		Uint8    *table;
+		SDL_PixelFormat *dst;
+	} SDL_gfxBlitInfo;
+
+	/*!
+	\brief Unwrap RGBA values from a pixel using mask, shift and loss for surface. r, g, b, a must be assignable; pixel and fmt may be arbitrary expressions but are evaluated several times.
+	*/
+#define GFX_RGBA_FROM_PIXEL(pixel, fmt, r, g, b, a)				\
+	{									\
+	(r) = (((pixel)&(fmt)->Rmask)>>(fmt)->Rshift)<<(fmt)->Rloss;		\
+	(g) = (((pixel)&(fmt)->Gmask)>>(fmt)->Gshift)<<(fmt)->Gloss;		\
+	(b) = (((pixel)&(fmt)->Bmask)>>(fmt)->Bshift)<<(fmt)->Bloss;		\
+	(a) = (((pixel)&(fmt)->Amask)>>(fmt)->Ashift)<<(fmt)->Aloss;		\
+	}
+
+	/*!
+	\brief Disassemble buffer pointer into a pixel and separate RGBA values. Reads one Uint32 from buf, then clears the alpha mask bits in pixel; bpp is not used.
+	*/
+#define GFX_DISASSEMBLE_RGBA(buf, bpp, fmt, pixel, r, g, b, a)			   \
+	do {									   \
+	(pixel) = *((Uint32 *)(buf));						   \
+	GFX_RGBA_FROM_PIXEL((pixel), (fmt), r, g, b, a);			   \
+	(pixel) &= ~(fmt)->Amask;						   \
+	} while(0)
+
+	/*!
+	\brief Wrap a pixel from RGBA values using mask, shift and loss for surface. Every channel, alpha included, drops its low "loss" bits before being shifted into place.
+	*/
+#define GFX_PIXEL_FROM_RGBA(pixel, fmt, r, g, b, a)				\
+	{									\
+	(pixel) = (((r)>>(fmt)->Rloss)<<(fmt)->Rshift)|			\
+	(((g)>>(fmt)->Gloss)<<(fmt)->Gshift)|					\
+	(((b)>>(fmt)->Bloss)<<(fmt)->Bshift)|					\
+	(((a)>>(fmt)->Aloss)<<(fmt)->Ashift);					\
+	}
+
+	/*!
+	\brief Assemble pixel into buffer pointer from separate RGBA values. Writes one Uint32 to buf; bpp is not used.
+	*/
+#define GFX_ASSEMBLE_RGBA(buf, bpp, fmt, r, g, b, a)			\
+	{									\
+	/* unusual name: must not shadow identifiers used in the arguments */	\
+	Uint32 gfx_assembled_pixel_;						\
+	GFX_PIXEL_FROM_RGBA(gfx_assembled_pixel_, (fmt), (r), (g), (b), (a));	\
+	*((Uint32 *)(buf)) = gfx_assembled_pixel_;				\
+	}
+
+	/*!
+	\brief Blend the RGB values of two pixels based on a source alpha value. dR, dG, dB are updated in place; the result is d + ((s - d) * A) / 255 per channel.
+	*/
+#define GFX_ALPHA_BLEND(sR, sG, sB, A, dR, dG, dB)	\
+	do {						\
+	(dR) = ((((sR)-(dR))*(A))/255)+(dR);		\
+	(dG) = ((((sG)-(dG))*(A))/255)+(dG);		\
+	(dB) = ((((sB)-(dB))*(A))/255)+(dB);		\
+	} while(0)
+
+	/*!
+	\brief 4-times unrolled DUFFs loop; runs pixel_copy_increment width times.
+
+	width is evaluated twice and must be > 0: with width == 0 the statement still runs four times.
+	*/
+#define GFX_DUFFS_LOOP4(pixel_copy_increment, width)			\
+	{ int n = ((width)+3)/4;						\
+	switch ((width) & 3) {						\
+	case 0: do {	pixel_copy_increment;				\
+	case 3:		pixel_copy_increment;				\
+	case 2:		pixel_copy_increment;				\
+	case 1:		pixel_copy_increment;				\
+	} while ( --n > 0 );					\
+	}								\
+	}
+
+
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SDL_gfxBlitFunc_h */
diff --git a/include/SDL/SDL_gfxPrimitives.h b/include/SDL/SDL_gfxPrimitives.h
new file mode 100644
index 0000000..c10ce5c
--- /dev/null
+++ b/include/SDL/SDL_gfxPrimitives.h
@@ -0,0 +1,246 @@
+/* 
+
+SDL_gfxPrimitives.h: graphics primitives for SDL
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL_gfxPrimitives_h
+#define _SDL_gfxPrimitives_h
+
+#include <math.h>
+#ifndef M_PI
+#define M_PI	3.1415926535897932384626433832795
+#endif
+
+#include "SDL.h"
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* ----- Versioning */
+
+#define SDL_GFXPRIMITIVES_MAJOR	2
+#define SDL_GFXPRIMITIVES_MINOR	0
+#define SDL_GFXPRIMITIVES_MICRO	25
+
+
+	/* ---- Function Prototypes */
+
+/* Linkage prefix for the prototypes below: a __declspec import/export marker
+   when the DLL macros are set under MSVC, plain extern in every other case. */
+#ifdef _MSC_VER
+#  if defined(LIBSDL_GFX_DLL_IMPORT)
+#    define SDL_GFXPRIMITIVES_SCOPE __declspec(dllimport)
+#  elif defined(DLL_EXPORT)
+#    define SDL_GFXPRIMITIVES_SCOPE __declspec(dllexport)
+#  endif
+#endif
+#if !defined(SDL_GFXPRIMITIVES_SCOPE)
+#  define SDL_GFXPRIMITIVES_SCOPE extern
+#endif
+
+	/* Note: all ___Color routines expect the color to be in format 0xRRGGBBAA */
+
+	/* Pixel */
+
+	SDL_GFXPRIMITIVES_SCOPE int pixelColor(SDL_Surface * dst, Sint16 x, Sint16 y, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int pixelRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Horizontal line */
+
+	SDL_GFXPRIMITIVES_SCOPE int hlineColor(SDL_Surface * dst, Sint16 x1, Sint16 x2, Sint16 y, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int hlineRGBA(SDL_Surface * dst, Sint16 x1, Sint16 x2, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Vertical line */
+
+	SDL_GFXPRIMITIVES_SCOPE int vlineColor(SDL_Surface * dst, Sint16 x, Sint16 y1, Sint16 y2, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int vlineRGBA(SDL_Surface * dst, Sint16 x, Sint16 y1, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Rectangle */
+
+	SDL_GFXPRIMITIVES_SCOPE int rectangleColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int rectangleRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Rounded-Corner Rectangle */
+
+	SDL_GFXPRIMITIVES_SCOPE int roundedRectangleColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int roundedRectangleRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled rectangle (Box) */
+
+	SDL_GFXPRIMITIVES_SCOPE int boxColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int boxRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2,
+		Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Rounded-Corner Filled rectangle (Box) */
+
+	SDL_GFXPRIMITIVES_SCOPE int roundedBoxColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int roundedBoxRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2,
+		Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Line */
+
+	SDL_GFXPRIMITIVES_SCOPE int lineColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int lineRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA Line */
+
+	SDL_GFXPRIMITIVES_SCOPE int aalineColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int aalineRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1,
+		Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Thick Line */
+	SDL_GFXPRIMITIVES_SCOPE int thickLineColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, 
+		Uint8 width, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int thickLineRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, 
+		Uint8 width, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Circle */
+
+	SDL_GFXPRIMITIVES_SCOPE int circleColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int circleRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Arc */
+
+	SDL_GFXPRIMITIVES_SCOPE int arcColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int arcRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, 
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA Circle */
+
+	SDL_GFXPRIMITIVES_SCOPE int aacircleColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int aacircleRGBA(SDL_Surface * dst, Sint16 x, Sint16 y,
+		Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Circle (radius is named rad in both variants; r is the red component) */
+
+	SDL_GFXPRIMITIVES_SCOPE int filledCircleColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int filledCircleRGBA(SDL_Surface * dst, Sint16 x, Sint16 y,
+		Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Ellipse */
+
+	SDL_GFXPRIMITIVES_SCOPE int ellipseColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int ellipseRGBA(SDL_Surface * dst, Sint16 x, Sint16 y,
+		Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA Ellipse */
+
+	SDL_GFXPRIMITIVES_SCOPE int aaellipseColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int aaellipseRGBA(SDL_Surface * dst, Sint16 x, Sint16 y,
+		Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Ellipse */
+
+	SDL_GFXPRIMITIVES_SCOPE int filledEllipseColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int filledEllipseRGBA(SDL_Surface * dst, Sint16 x, Sint16 y,
+		Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Pie */
+
+	SDL_GFXPRIMITIVES_SCOPE int pieColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int pieRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Pie */
+
+	SDL_GFXPRIMITIVES_SCOPE int filledPieColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int filledPieRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad,
+		Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Trigon */
+
+	SDL_GFXPRIMITIVES_SCOPE int trigonColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int trigonRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA-Trigon */
+
+	SDL_GFXPRIMITIVES_SCOPE int aatrigonColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int aatrigonRGBA(SDL_Surface * dst,  Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Trigon */
+
+	SDL_GFXPRIMITIVES_SCOPE int filledTrigonColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int filledTrigonRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+		Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Polygon */
+
+	SDL_GFXPRIMITIVES_SCOPE int polygonColor(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int polygonRGBA(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy,
+		int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* AA-Polygon */
+
+	SDL_GFXPRIMITIVES_SCOPE int aapolygonColor(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int aapolygonRGBA(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy,
+		int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Filled Polygon */
+
+	SDL_GFXPRIMITIVES_SCOPE int filledPolygonColor(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int filledPolygonRGBA(SDL_Surface * dst, const Sint16 * vx,
+		const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+	SDL_GFXPRIMITIVES_SCOPE int texturedPolygon(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, SDL_Surface * texture,int texture_dx,int texture_dy);
+
+	/* (Note: These MT versions are required for multi-threaded operation.) */
+
+	SDL_GFXPRIMITIVES_SCOPE int filledPolygonColorMT(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color, int **polyInts, int *polyAllocated);
+	SDL_GFXPRIMITIVES_SCOPE int filledPolygonRGBAMT(SDL_Surface * dst, const Sint16 * vx,
+		const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a,
+		int **polyInts, int *polyAllocated);
+	SDL_GFXPRIMITIVES_SCOPE int texturedPolygonMT(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, SDL_Surface * texture,int texture_dx,int texture_dy, int **polyInts, int *polyAllocated);
+
+	/* Bezier */
+
+	SDL_GFXPRIMITIVES_SCOPE int bezierColor(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, int s, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int bezierRGBA(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy,
+		int n, int s, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Characters/Strings */
+
+	SDL_GFXPRIMITIVES_SCOPE void gfxPrimitivesSetFont(const void *fontdata, Uint32 cw, Uint32 ch);
+	SDL_GFXPRIMITIVES_SCOPE void gfxPrimitivesSetFontRotation(Uint32 rotation);
+	SDL_GFXPRIMITIVES_SCOPE int characterColor(SDL_Surface * dst, Sint16 x, Sint16 y, char c, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int characterRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, char c, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+	SDL_GFXPRIMITIVES_SCOPE int stringColor(SDL_Surface * dst, Sint16 x, Sint16 y, const char *s, Uint32 color);
+	SDL_GFXPRIMITIVES_SCOPE int stringRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, const char *s, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL_gfxPrimitives_h */
diff --git a/include/SDL/SDL_gfxPrimitives_font.h b/include/SDL/SDL_gfxPrimitives_font.h
new file mode 100644
index 0000000..198a433
--- /dev/null
+++ b/include/SDL/SDL_gfxPrimitives_font.h
@@ -0,0 +1,3082 @@
+
+/* ---- 8x8 font definition ---- */
+
+/*  ZLIB (c) A. Schiffler 2001-2012 */
+
+#define GFX_FONTDATAMAX (8*256)
+
+static unsigned char gfxPrimitivesFontdata[GFX_FONTDATAMAX] = {
+
+	/*
+	* 0 0x00 '^@' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 1 0x01 '^A' 
+	*/
+	0x7e,			/* 01111110 */
+	0x81,			/* 10000001 */
+	0xa5,			/* 10100101 */
+	0x81,			/* 10000001 */
+	0xbd,			/* 10111101 */
+	0x99,			/* 10011001 */
+	0x81,			/* 10000001 */
+	0x7e,			/* 01111110 */
+
+	/*
+	* 2 0x02 '^B' 
+	*/
+	0x7e,			/* 01111110 */
+	0xff,			/* 11111111 */
+	0xdb,			/* 11011011 */
+	0xff,			/* 11111111 */
+	0xc3,			/* 11000011 */
+	0xe7,			/* 11100111 */
+	0xff,			/* 11111111 */
+	0x7e,			/* 01111110 */
+
+	/*
+	* 3 0x03 '^C' 
+	*/
+	0x6c,			/* 01101100 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0x7c,			/* 01111100 */
+	0x38,			/* 00111000 */
+	0x10,			/* 00010000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 4 0x04 '^D' 
+	*/
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x7c,			/* 01111100 */
+	0xfe,			/* 11111110 */
+	0x7c,			/* 01111100 */
+	0x38,			/* 00111000 */
+	0x10,			/* 00010000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 5 0x05 '^E' 
+	*/
+	0x38,			/* 00111000 */
+	0x7c,			/* 01111100 */
+	0x38,			/* 00111000 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0xd6,			/* 11010110 */
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+
+	/*
+	* 6 0x06 '^F' 
+	*/
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x7c,			/* 01111100 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0x7c,			/* 01111100 */
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+
+	/*
+	* 7 0x07 '^G' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 8 0x08 '^H' 
+	*/
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xe7,			/* 11100111 */
+	0xc3,			/* 11000011 */
+	0xc3,			/* 11000011 */
+	0xe7,			/* 11100111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 9 0x09 '^I' 
+	*/
+	0x00,			/* 00000000 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x42,			/* 01000010 */
+	0x42,			/* 01000010 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 10 0x0a '^J' 
+	*/
+	0xff,			/* 11111111 */
+	0xc3,			/* 11000011 */
+	0x99,			/* 10011001 */
+	0xbd,			/* 10111101 */
+	0xbd,			/* 10111101 */
+	0x99,			/* 10011001 */
+	0xc3,			/* 11000011 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 11 0x0b '^K' 
+	*/
+	0x0f,			/* 00001111 */
+	0x07,			/* 00000111 */
+	0x0f,			/* 00001111 */
+	0x7d,			/* 01111101 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x78,			/* 01111000 */
+
+	/*
+	* 12 0x0c '^L' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 13 0x0d '^M' 
+	*/
+	0x3f,			/* 00111111 */
+	0x33,			/* 00110011 */
+	0x3f,			/* 00111111 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x70,			/* 01110000 */
+	0xf0,			/* 11110000 */
+	0xe0,			/* 11100000 */
+
+	/*
+	* 14 0x0e '^N' 
+	*/
+	0x7f,			/* 01111111 */
+	0x63,			/* 01100011 */
+	0x7f,			/* 01111111 */
+	0x63,			/* 01100011 */
+	0x63,			/* 01100011 */
+	0x67,			/* 01100111 */
+	0xe6,			/* 11100110 */
+	0xc0,			/* 11000000 */
+
+	/*
+	* 15 0x0f '^O' 
+	*/
+	0x18,			/* 00011000 */
+	0xdb,			/* 11011011 */
+	0x3c,			/* 00111100 */
+	0xe7,			/* 11100111 */
+	0xe7,			/* 11100111 */
+	0x3c,			/* 00111100 */
+	0xdb,			/* 11011011 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 16 0x10 '^P' 
+	*/
+	0x80,			/* 10000000 */
+	0xe0,			/* 11100000 */
+	0xf8,			/* 11111000 */
+	0xfe,			/* 11111110 */
+	0xf8,			/* 11111000 */
+	0xe0,			/* 11100000 */
+	0x80,			/* 10000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 17 0x11 '^Q' 
+	*/
+	0x02,			/* 00000010 */
+	0x0e,			/* 00001110 */
+	0x3e,			/* 00111110 */
+	0xfe,			/* 11111110 */
+	0x3e,			/* 00111110 */
+	0x0e,			/* 00001110 */
+	0x02,			/* 00000010 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 18 0x12 '^R' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 19 0x13 '^S' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 20 0x14 '^T' 
+	*/
+	0x7f,			/* 01111111 */
+	0xdb,			/* 11011011 */
+	0xdb,			/* 11011011 */
+	0x7b,			/* 01111011 */
+	0x1b,			/* 00011011 */
+	0x1b,			/* 00011011 */
+	0x1b,			/* 00011011 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 21 0x15 '^U' 
+	*/
+	0x3e,			/* 00111110 */
+	0x61,			/* 01100001 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x86,			/* 10000110 */
+	0x7c,			/* 01111100 */
+
+	/*
+	* 22 0x16 '^V' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x7e,			/* 01111110 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 23 0x17 '^W' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 24 0x18 '^X' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 25 0x19 '^Y' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 26 0x1a '^Z' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0xfe,			/* 11111110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 27 0x1b '^[' 
+	*/
+	0x00,			/* 00000000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0xfe,			/* 11111110 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 28 0x1c '^\' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 29 0x1d '^]' 
+	*/
+	0x00,			/* 00000000 */
+	0x24,			/* 00100100 */
+	0x66,			/* 01100110 */
+	0xff,			/* 11111111 */
+	0x66,			/* 01100110 */
+	0x24,			/* 00100100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 30 0x1e '^^' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 31 0x1f '^_' 
+	*/
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0x7e,			/* 01111110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 32 0x20 ' ' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 33 0x21 '!' 
+	*/
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 34 0x22 '"' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x24,			/* 00100100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 35 0x23 '#' 
+	*/
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 36 0x24 '$' 
+	*/
+	0x18,			/* 00011000 */
+	0x3e,			/* 00111110 */
+	0x60,			/* 01100000 */
+	0x3c,			/* 00111100 */
+	0x06,			/* 00000110 */
+	0x7c,			/* 01111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 37 0x25 '%' 
+	*/
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xcc,			/* 11001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x66,			/* 01100110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 38 0x26 '&' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 39 0x27 ''' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 40 0x28 '(' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 41 0x29 ')' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 42 0x2a '*' 
+	*/
+	0x00,			/* 00000000 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0xff,			/* 11111111 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 43 0x2b '+' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 44 0x2c ',' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+
+	/*
+	* 45 0x2d '-' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 46 0x2e '.' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 47 0x2f '/' 
+	*/
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0xc0,			/* 11000000 */
+	0x80,			/* 10000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 48 0x30 '0' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xd6,			/* 11010110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 49 0x31 '1' 
+	*/
+	0x18,			/* 00011000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 50 0x32 '2' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0x06,			/* 00000110 */
+	0x1c,			/* 00011100 */
+	0x30,			/* 00110000 */
+	0x66,			/* 01100110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 51 0x33 '3' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0x06,			/* 00000110 */
+	0x3c,			/* 00111100 */
+	0x06,			/* 00000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 52 0x34 '4' 
+	*/
+	0x1c,			/* 00011100 */
+	0x3c,			/* 00111100 */
+	0x6c,			/* 01101100 */
+	0xcc,			/* 11001100 */
+	0xfe,			/* 11111110 */
+	0x0c,			/* 00001100 */
+	0x1e,			/* 00011110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 53 0x35 '5' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xfc,			/* 11111100 */
+	0x06,			/* 00000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 54 0x36 '6' 
+	*/
+	0x38,			/* 00111000 */
+	0x60,			/* 01100000 */
+	0xc0,			/* 11000000 */
+	0xfc,			/* 11111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 55 0x37 '7' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 56 0x38 '8' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 57 0x39 '9' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7e,			/* 01111110 */
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x78,			/* 01111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 58 0x3a ':' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 59 0x3b ';' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+
+	/*
+	* 60 0x3c '<' 
+	*/
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x06,			/* 00000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 61 0x3d '=' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 62 0x3e '>' 
+	*/
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 63 0x3f '?' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 64 0x40 '@' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xde,			/* 11011110 */
+	0xde,			/* 11011110 */
+	0xde,			/* 11011110 */
+	0xc0,			/* 11000000 */
+	0x78,			/* 01111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 65 0x41 'A' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 66 0x42 'B' 
+	*/
+	0xfc,			/* 11111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0xfc,			/* 11111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 67 0x43 'C' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 68 0x44 'D' 
+	*/
+	0xf8,			/* 11111000 */
+	0x6c,			/* 01101100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x6c,			/* 01101100 */
+	0xf8,			/* 11111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 69 0x45 'E' 
+	*/
+	0xfe,			/* 11111110 */
+	0x62,			/* 01100010 */
+	0x68,			/* 01101000 */
+	0x78,			/* 01111000 */
+	0x68,			/* 01101000 */
+	0x62,			/* 01100010 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 70 0x46 'F' 
+	*/
+	0xfe,			/* 11111110 */
+	0x62,			/* 01100010 */
+	0x68,			/* 01101000 */
+	0x78,			/* 01111000 */
+	0x68,			/* 01101000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 71 0x47 'G' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xce,			/* 11001110 */
+	0x66,			/* 01100110 */
+	0x3a,			/* 00111010 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 72 0x48 'H' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 73 0x49 'I' 
+	*/
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 74 0x4a 'J' 
+	*/
+	0x1e,			/* 00011110 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x78,			/* 01111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 75 0x4b 'K' 
+	*/
+	0xe6,			/* 11100110 */
+	0x66,			/* 01100110 */
+	0x6c,			/* 01101100 */
+	0x78,			/* 01111000 */
+	0x6c,			/* 01101100 */
+	0x66,			/* 01100110 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 76 0x4c 'L' 
+	*/
+	0xf0,			/* 11110000 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0x62,			/* 01100010 */
+	0x66,			/* 01100110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 77 0x4d 'M' 
+	*/
+	0xc6,			/* 11000110 */
+	0xee,			/* 11101110 */
+	0xfe,			/* 11111110 */
+	0xfe,			/* 11111110 */
+	0xd6,			/* 11010110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 78 0x4e 'N' 
+	*/
+	0xc6,			/* 11000110 */
+	0xe6,			/* 11100110 */
+	0xf6,			/* 11110110 */
+	0xde,			/* 11011110 */
+	0xce,			/* 11001110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 79 0x4f 'O' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 80 0x50 'P' 
+	*/
+	0xfc,			/* 11111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 81 0x51 'Q' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xce,			/* 11001110 */
+	0x7c,			/* 01111100 */
+	0x0e,			/* 00001110 */
+
+	/*
+	* 82 0x52 'R' 
+	*/
+	0xfc,			/* 11111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x6c,			/* 01101100 */
+	0x66,			/* 01100110 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 83 0x53 'S' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 84 0x54 'T' 
+	*/
+	0x7e,			/* 01111110 */
+	0x7e,			/* 01111110 */
+	0x5a,			/* 01011010 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 85 0x55 'U' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 86 0x56 'V' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 87 0x57 'W' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 88 0x58 'X' 
+	*/
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 89 0x59 'Y' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 90 0x5a 'Z' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x8c,			/* 10001100 */
+	0x18,			/* 00011000 */
+	0x32,			/* 00110010 */
+	0x66,			/* 01100110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 91 0x5b '[' 
+	*/
+	0x3c,			/* 00111100 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 92 0x5c '\' 
+	*/
+	0xc0,			/* 11000000 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x06,			/* 00000110 */
+	0x02,			/* 00000010 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 93 0x5d ']' 
+	*/
+	0x3c,			/* 00111100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 94 0x5e '^' 
+	*/
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 95 0x5f '_' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 96 0x60 '`' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 97 0x61 'a' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 98 0x62 'b' 
+	*/
+	0xe0,			/* 11100000 */
+	0x60,			/* 01100000 */
+	0x7c,			/* 01111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 99 0x63 'c' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc0,			/* 11000000 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 100 0x64 'd' 
+	*/
+	0x1c,			/* 00011100 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 101 0x65 'e' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 102 0x66 'f' 
+	*/
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x60,			/* 01100000 */
+	0xf8,			/* 11111000 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 103 0x67 'g' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x7c,			/* 01111100 */
+	0x0c,			/* 00001100 */
+	0xf8,			/* 11111000 */
+
+	/*
+	* 104 0x68 'h' 
+	*/
+	0xe0,			/* 11100000 */
+	0x60,			/* 01100000 */
+	0x6c,			/* 01101100 */
+	0x76,			/* 01110110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 105 0x69 'i' 
+	*/
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 106 0x6a 'j' 
+	*/
+	0x06,			/* 00000110 */
+	0x00,			/* 00000000 */
+	0x06,			/* 00000110 */
+	0x06,			/* 00000110 */
+	0x06,			/* 00000110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+
+	/*
+	* 107 0x6b 'k' 
+	*/
+	0xe0,			/* 11100000 */
+	0x60,			/* 01100000 */
+	0x66,			/* 01100110 */
+	0x6c,			/* 01101100 */
+	0x78,			/* 01111000 */
+	0x6c,			/* 01101100 */
+	0xe6,			/* 11100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 108 0x6c 'l' 
+	*/
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 109 0x6d 'm' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xec,			/* 11101100 */
+	0xfe,			/* 11111110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 110 0x6e 'n' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 111 0x6f 'o' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 112 0x70 'p' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+
+	/*
+	* 113 0x71 'q' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x7c,			/* 01111100 */
+	0x0c,			/* 00001100 */
+	0x1e,			/* 00011110 */
+
+	/*
+	* 114 0x72 'r' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x76,			/* 01110110 */
+	0x60,			/* 01100000 */
+	0x60,			/* 01100000 */
+	0xf0,			/* 11110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 115 0x73 's' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x06,			/* 00000110 */
+	0xfc,			/* 11111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 116 0x74 't' 
+	*/
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0xfc,			/* 11111100 */
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x36,			/* 00110110 */
+	0x1c,			/* 00011100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 117 0x75 'u' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 118 0x76 'v' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 119 0x77 'w' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xd6,			/* 11010110 */
+	0xd6,			/* 11010110 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 120 0x78 'x' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 121 0x79 'y' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7e,			/* 01111110 */
+	0x06,			/* 00000110 */
+	0xfc,			/* 11111100 */
+
+	/*
+	* 122 0x7a 'z' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x4c,			/* 01001100 */
+	0x18,			/* 00011000 */
+	0x32,			/* 00110010 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 123 0x7b '{' 
+	*/
+	0x0e,			/* 00001110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x70,			/* 01110000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x0e,			/* 00001110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 124 0x7c '|' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 125 0x7d '}' 
+	*/
+	0x70,			/* 01110000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x0e,			/* 00001110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x70,			/* 01110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 126 0x7e '~' 
+	*/
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 127 0x7f '' 
+	*/
+	0x00,			/* 00000000 */
+	0x10,			/* 00010000 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 128 0x80 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x0c,			/* 00001100 */
+	0x78,			/* 01111000 */
+
+	/*
+	* 129 0x81 '�' 
+	*/
+	0xcc,			/* 11001100 */
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 130 0x82 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 131 0x83 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 132 0x84 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 133 0x85 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 134 0x86 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x30,			/* 00110000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 135 0x87 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x7e,			/* 01111110 */
+	0x0c,			/* 00001100 */
+	0x38,			/* 00111000 */
+
+	/*
+	* 136 0x88 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 137 0x89 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 138 0x8a '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 139 0x8b '�' 
+	*/
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 140 0x8c '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 141 0x8d '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 142 0x8e '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 143 0x8f '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 144 0x90 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0xf8,			/* 11111000 */
+	0xc0,			/* 11000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 145 0x91 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0xd8,			/* 11011000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 146 0x92 '�' 
+	*/
+	0x3e,			/* 00111110 */
+	0x6c,			/* 01101100 */
+	0xcc,			/* 11001100 */
+	0xfe,			/* 11111110 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xce,			/* 11001110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 147 0x93 '�' 
+	*/
+	0x7c,			/* 01111100 */
+	0x82,			/* 10000010 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 148 0x94 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 149 0x95 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 150 0x96 '�' 
+	*/
+	0x78,			/* 01111000 */
+	0x84,			/* 10000100 */
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 151 0x97 '�' 
+	*/
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 152 0x98 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7e,			/* 01111110 */
+	0x06,			/* 00000110 */
+	0xfc,			/* 11111100 */
+
+	/*
+	* 153 0x99 '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 154 0x9a '�' 
+	*/
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 155 0x9b '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 156 0x9c '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x64,			/* 01100100 */
+	0xf0,			/* 11110000 */
+	0x60,			/* 01100000 */
+	0x66,			/* 01100110 */
+	0xfc,			/* 11111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 157 0x9d '�' 
+	*/
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 158 0x9e '�' 
+	*/
+	0xf8,			/* 11111000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xfa,			/* 11111010 */
+	0xc6,			/* 11000110 */
+	0xcf,			/* 11001111 */
+	0xc6,			/* 11000110 */
+	0xc7,			/* 11000111 */
+
+	/*
+	* 159 0x9f '�' 
+	*/
+	0x0e,			/* 00001110 */
+	0x1b,			/* 00011011 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0xd8,			/* 11011000 */
+	0x70,			/* 01110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 160 0xa0 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x7c,			/* 01111100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 161 0xa1 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x38,			/* 00111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 162 0xa2 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 163 0xa3 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 164 0xa4 '�' 
+	*/
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0xdc,			/* 11011100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 165 0xa5 '�' 
+	*/
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0xe6,			/* 11100110 */
+	0xf6,			/* 11110110 */
+	0xde,			/* 11011110 */
+	0xce,			/* 11001110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 166 0xa6 '�' 
+	*/
+	0x3c,			/* 00111100 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x3e,			/* 00111110 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 167 0xa7 '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 168 0xa8 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x63,			/* 01100011 */
+	0x3e,			/* 00111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 169 0xa9 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 170 0xaa '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x06,			/* 00000110 */
+	0x06,			/* 00000110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 171 0xab '�' 
+	*/
+	0x63,			/* 01100011 */
+	0xe6,			/* 11100110 */
+	0x6c,			/* 01101100 */
+	0x7e,			/* 01111110 */
+	0x33,			/* 00110011 */
+	0x66,			/* 01100110 */
+	0xcc,			/* 11001100 */
+	0x0f,			/* 00001111 */
+
+	/*
+	* 172 0xac '�' 
+	*/
+	0x63,			/* 01100011 */
+	0xe6,			/* 11100110 */
+	0x6c,			/* 01101100 */
+	0x7a,			/* 01111010 */
+	0x36,			/* 00110110 */
+	0x6a,			/* 01101010 */
+	0xdf,			/* 11011111 */
+	0x06,			/* 00000110 */
+
+	/*
+	* 173 0xad '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 174 0xae '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x33,			/* 00110011 */
+	0x66,			/* 01100110 */
+	0xcc,			/* 11001100 */
+	0x66,			/* 01100110 */
+	0x33,			/* 00110011 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 175 0xaf '�' 
+	*/
+	0x00,			/* 00000000 */
+	0xcc,			/* 11001100 */
+	0x66,			/* 01100110 */
+	0x33,			/* 00110011 */
+	0x66,			/* 01100110 */
+	0xcc,			/* 11001100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 176 0xb0 '�' 
+	*/
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+	0x22,			/* 00100010 */
+	0x88,			/* 10001000 */
+
+	/*
+	* 177 0xb1 '�' 
+	*/
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+	0x55,			/* 01010101 */
+	0xaa,			/* 10101010 */
+
+	/*
+	* 178 0xb2 '�' 
+	*/
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+	0x77,			/* 01110111 */
+	0xdd,			/* 11011101 */
+
+	/*
+	* 179 0xb3 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 180 0xb4 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 181 0xb5 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 182 0xb6 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf6,			/* 11110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 183 0xb7 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 184 0xb8 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 185 0xb9 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf6,			/* 11110110 */
+	0x06,			/* 00000110 */
+	0xf6,			/* 11110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 186 0xba '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 187 0xbb '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x06,			/* 00000110 */
+	0xf6,			/* 11110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 188 0xbc '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf6,			/* 11110110 */
+	0x06,			/* 00000110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 189 0xbd '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 190 0xbe '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 191 0xbf '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xf8,			/* 11111000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 192 0xc0 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 193 0xc1 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 194 0xc2 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 195 0xc3 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 196 0xc4 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 197 0xc5 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 198 0xc6 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 199 0xc7 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x37,			/* 00110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 200 0xc8 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x37,			/* 00110111 */
+	0x30,			/* 00110000 */
+	0x3f,			/* 00111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 201 0xc9 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x3f,			/* 00111111 */
+	0x30,			/* 00110000 */
+	0x37,			/* 00110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 202 0xca '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf7,			/* 11110111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 203 0xcb '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xf7,			/* 11110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 204 0xcc '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x37,			/* 00110111 */
+	0x30,			/* 00110000 */
+	0x37,			/* 00110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 205 0xcd '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 206 0xce '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xf7,			/* 11110111 */
+	0x00,			/* 00000000 */
+	0xf7,			/* 11110111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 207 0xcf '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 208 0xd0 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 209 0xd1 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 210 0xd2 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 211 0xd3 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x3f,			/* 00111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 212 0xd4 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 213 0xd5 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 214 0xd6 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x3f,			/* 00111111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 215 0xd7 '�' 
+	*/
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0xff,			/* 11111111 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+
+	/*
+	* 216 0xd8 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0xff,			/* 11111111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 217 0xd9 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xf8,			/* 11111000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 218 0xda '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x1f,			/* 00011111 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 219 0xdb '�' 
+	*/
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 220 0xdc '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+
+	/*
+	* 221 0xdd '�' 
+	*/
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+	0xf0,			/* 11110000 */
+
+	/*
+	* 222 0xde '�' 
+	*/
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+	0x0f,			/* 00001111 */
+
+	/*
+	* 223 0xdf '�' 
+	*/
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0xff,			/* 11111111 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 224 0xe0 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0xc8,			/* 11001000 */
+	0xdc,			/* 11011100 */
+	0x76,			/* 01110110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 225 0xe1 '�' 
+	*/
+	0x78,			/* 01111000 */
+	0xcc,			/* 11001100 */
+	0xcc,			/* 11001100 */
+	0xd8,			/* 11011000 */
+	0xcc,			/* 11001100 */
+	0xc6,			/* 11000110 */
+	0xcc,			/* 11001100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 226 0xe2 '�' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0xc0,			/* 11000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 227 0xe3 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 228 0xe4 '�' 
+	*/
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 229 0xe5 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xd8,			/* 11011000 */
+	0xd8,			/* 11011000 */
+	0xd8,			/* 11011000 */
+	0x70,			/* 01110000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 230 0xe6 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x7c,			/* 01111100 */
+	0xc0,			/* 11000000 */
+
+	/*
+	* 231 0xe7 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 232 0xe8 '�' 
+	*/
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x3c,			/* 00111100 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+
+	/*
+	* 233 0xe9 '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xfe,			/* 11111110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 234 0xea '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0xee,			/* 11101110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 235 0xeb '�' 
+	*/
+	0x0e,			/* 00001110 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x3e,			/* 00111110 */
+	0x66,			/* 01100110 */
+	0x66,			/* 01100110 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 236 0xec '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0xdb,			/* 11011011 */
+	0xdb,			/* 11011011 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 237 0xed '�' 
+	*/
+	0x06,			/* 00000110 */
+	0x0c,			/* 00001100 */
+	0x7e,			/* 01111110 */
+	0xdb,			/* 11011011 */
+	0xdb,			/* 11011011 */
+	0x7e,			/* 01111110 */
+	0x60,			/* 01100000 */
+	0xc0,			/* 11000000 */
+
+	/*
+	* 238 0xee '�' 
+	*/
+	0x1e,			/* 00011110 */
+	0x30,			/* 00110000 */
+	0x60,			/* 01100000 */
+	0x7e,			/* 01111110 */
+	0x60,			/* 01100000 */
+	0x30,			/* 00110000 */
+	0x1e,			/* 00011110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 239 0xef '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x7c,			/* 01111100 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0xc6,			/* 11000110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 240 0xf0 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0xfe,			/* 11111110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 241 0xf1 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x7e,			/* 01111110 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 242 0xf2 '�' 
+	*/
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 243 0xf3 '�' 
+	*/
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x18,			/* 00011000 */
+	0x0c,			/* 00001100 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 244 0xf4 '�' 
+	*/
+	0x0e,			/* 00001110 */
+	0x1b,			/* 00011011 */
+	0x1b,			/* 00011011 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+
+	/*
+	* 245 0xf5 '�' 
+	*/
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0xd8,			/* 11011000 */
+	0xd8,			/* 11011000 */
+	0x70,			/* 01110000 */
+
+	/*
+	* 246 0xf6 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x7e,			/* 01111110 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 247 0xf7 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0x76,			/* 01110110 */
+	0xdc,			/* 11011100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 248 0xf8 '�' 
+	*/
+	0x38,			/* 00111000 */
+	0x6c,			/* 01101100 */
+	0x6c,			/* 01101100 */
+	0x38,			/* 00111000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 249 0xf9 '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 250 0xfa '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x18,			/* 00011000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 251 0xfb '�' 
+	*/
+	0x0f,			/* 00001111 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0x0c,			/* 00001100 */
+	0xec,			/* 11101100 */
+	0x6c,			/* 01101100 */
+	0x3c,			/* 00111100 */
+	0x1c,			/* 00011100 */
+
+	/*
+	* 252 0xfc '�' 
+	*/
+	0x6c,			/* 01101100 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x36,			/* 00110110 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 253 0xfd '�' 
+	*/
+	0x78,			/* 01111000 */
+	0x0c,			/* 00001100 */
+	0x18,			/* 00011000 */
+	0x30,			/* 00110000 */
+	0x7c,			/* 01111100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 254 0xfe '�' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x3c,			/* 00111100 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+	/*
+	* 255 0xff ' ' 
+	*/
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+	0x00,			/* 00000000 */
+
+};
diff --git a/include/SDL/SDL_imageFilter.h b/include/SDL/SDL_imageFilter.h
new file mode 100644
index 0000000..7293a56
--- /dev/null
+++ b/include/SDL/SDL_imageFilter.h
@@ -0,0 +1,215 @@
+/*
+
+SDL_imageFilter.h: byte-image "filter" routines 
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL_imageFilter_h
+#define _SDL_imageFilter_h
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL_GFX_DLL_IMPORT)
+#    define SDL_IMAGEFILTER_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL_GFX_DLL_IMPORT
+#      define SDL_IMAGEFILTER_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif /* _MSC_VER */
+#ifndef SDL_IMAGEFILTER_SCOPE /* not MSVC, or MSVC with neither DLL_EXPORT nor LIBSDL_GFX_DLL_IMPORT defined */
+#  define SDL_IMAGEFILTER_SCOPE extern
+#endif
+
+	/* Comments:                                                                           */
+	/*  1.) MMX functions work best if all data blocks are aligned on a 32-byte boundary.  */
+	/*  2.) Data that is not within an 8-byte boundary is processed using the C routine.   */
+	/*  3.) Convolution routines do not have C routines at this time.                      */
+
+	// Detect MMX capability in CPU (NOTE(review): presumably non-zero means MMX is usable -- confirm in the .c file)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterMMXdetect(void);
+
+	// Force use of MMX off (or turn possible use back on)
+	SDL_IMAGEFILTER_SCOPE void SDL_imageFilterMMXoff(void);
+	SDL_IMAGEFILTER_SCOPE void SDL_imageFilterMMXon(void);
+
+	// Notation in the formulas below: S/S1/S2 = source byte(s), D = destination byte, C/N/T* = the scalar argument(s).
+	// All routines return:
+	//   0   OK
+	//  -1   Error (internal error, parameter error)
+	// NOTE(review): "length" is presumably the number of bytes to process -- confirm in the .c file.
+
+	//  SDL_imageFilterAdd: D = saturation255(S1 + S2)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMean: D = S1/2 + S2/2
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterSub: D = saturation0(S1 - S2)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterAbsDiff: D = | S1 - S2 |
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMult: D = saturation(S1 * S2)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMultNor: D = S1 * S2   (non-MMX)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterMultDivby2: D = saturation255(S1/2 * S2)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest,
+		unsigned int length);
+
+	//  SDL_imageFilterMultDivby4: D = saturation255(S1/2 * S2/2)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest,
+		unsigned int length);
+
+	//  SDL_imageFilterBitAnd: D = S1 & S2
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterBitOr: D = S1 | S2
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterDiv: D = S1 / S2   (non-MMX)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterBitNegation: D = ~S   (bitwise NOT, going by the function name; previously written "!S", which is C's logical NOT)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length);
+
+	//  SDL_imageFilterAddByte: D = saturation255(S + C)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C);
+
+	//  SDL_imageFilterAddUint: D = saturation255(S + (uint)C)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C);
+
+	//  SDL_imageFilterAddByteToHalf: D = saturation255(S/2 + C)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char C);
+
+	//  SDL_imageFilterSubByte: D = saturation0(S - C)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C);
+
+	//  SDL_imageFilterSubUint: D = saturation0(S - (uint)C)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C);
+
+	//  SDL_imageFilterShiftRight: D = saturation0(S >> N)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N);
+
+	//  SDL_imageFilterShiftRightUint: D = saturation0((uint)S >> N)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N);
+
+	//  SDL_imageFilterMultByByte: D = saturation255(S * C)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C);
+
+	//  SDL_imageFilterShiftRightAndMultByByte: D = saturation255((S >> N) * C)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char N, unsigned char C);
+
+	//  SDL_imageFilterShiftLeftByte: D = (S << N)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char N);
+
+	//  SDL_imageFilterShiftLeftUint: D = ((uint)S << N)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char N);
+
+	//  SDL_imageFilterShiftLeft: D = saturation255(S << N)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N);
+
+	//  SDL_imageFilterBinarizeUsingThreshold: D = S >= T ? 255:0
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char T);
+
+	//  SDL_imageFilterClipToRange: D = (S >= Tmin) & (S <= Tmax) 255:0   NOTE(review): formula has no '?' and the name suggests clamping S to [Tmin, Tmax] -- confirm in the .c file
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length,
+		unsigned char Tmin, unsigned char Tmax);
+
+	//  SDL_imageFilterNormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin,
+		int Cmax, int Nmin, int Nmax);
+
+	/* !!! NO C-ROUTINE FOR THESE FUNCTIONS YET !!! */
+
+	//  SDL_imageFilterConvolveKernel3x3Divide: Dij = saturation0and255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows,
+		int columns, signed short *Kernel, unsigned char Divisor);
+
+	//  SDL_imageFilterConvolveKernel5x5Divide: Dij = saturation0and255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows,
+		int columns, signed short *Kernel, unsigned char Divisor);
+
+	//  SDL_imageFilterConvolveKernel7x7Divide: Dij = saturation0and255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows,
+		int columns, signed short *Kernel, unsigned char Divisor);
+
+	//  SDL_imageFilterConvolveKernel9x9Divide: Dij = saturation0and255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows,
+		int columns, signed short *Kernel, unsigned char Divisor);
+
+	//  SDL_imageFilterConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows,
+		int columns, signed short *Kernel,
+		unsigned char NRightShift);
+
+	//  SDL_imageFilterConvolveKernel5x5ShiftRight: Dij = saturation0and255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows,
+		int columns, signed short *Kernel,
+		unsigned char NRightShift);
+
+	//  SDL_imageFilterConvolveKernel7x7ShiftRight: Dij = saturation0and255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows,
+		int columns, signed short *Kernel,
+		unsigned char NRightShift);
+
+	//  SDL_imageFilterConvolveKernel9x9ShiftRight: Dij = saturation0and255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows,
+		int columns, signed short *Kernel,
+		unsigned char NRightShift);
+
+	//  SDL_imageFilterSobelX: Dij = saturation255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns);
+
+	//  SDL_imageFilterSobelXShiftRight: Dij = saturation255( ... )
+	SDL_IMAGEFILTER_SCOPE int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+		unsigned char NRightShift);
+
+	// Align/restore stack to 32 byte boundary -- Functionality untested! --
+	SDL_IMAGEFILTER_SCOPE void SDL_imageFilterAlignStack(void);
+	SDL_IMAGEFILTER_SCOPE void SDL_imageFilterRestoreStack(void);
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL_imageFilter_h */
diff --git a/include/SDL_joystick.h b/include/SDL/SDL_joystick.h
similarity index 100%
rename from include/SDL_joystick.h
rename to include/SDL/SDL_joystick.h
diff --git a/include/SDL_keyboard.h b/include/SDL/SDL_keyboard.h
similarity index 100%
rename from include/SDL_keyboard.h
rename to include/SDL/SDL_keyboard.h
diff --git a/include/SDL/SDL_keysym.h b/include/SDL/SDL_keysym.h
new file mode 100644
index 0000000..66d4f86
--- /dev/null
+++ b/include/SDL/SDL_keysym.h
@@ -0,0 +1,382 @@
+/*
+    SDL - Simple DirectMedia Layer
+    Copyright (C) 1997-2012 Sam Lantinga
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    Sam Lantinga
+    slouken@libsdl.org
+*/
+
+#ifndef _SDL_keysym_h
+#define _SDL_keysym_h
+
+/** What we really want is a mapping of every raw key on the keyboard.
+ *  To support international keyboards, we use the range 0xA0 - 0xFF
+ *  as international virtual keycodes.  We'll follow in the footsteps of X11...
+ *  @brief The names of the keys
+ */
+typedef enum
+{
+    /** @name ASCII mapped keysyms
+     *  The keyboard syms have been cleverly chosen to map to ASCII
+     */
+    /*@{*/
+    SDLK_UNKNOWN		= 0,
+    SDLK_FIRST		= 0,		/**< Same value as SDLK_UNKNOWN */
+    SDLK_BACKSPACE		= 8,
+    SDLK_TAB		= 9,
+    SDLK_CLEAR		= 12,
+    SDLK_RETURN		= 13,
+    SDLK_PAUSE		= 19,
+    SDLK_ESCAPE		= 27,
+    SDLK_SPACE		= 32,
+    SDLK_EXCLAIM		= 33,
+    SDLK_QUOTEDBL		= 34,
+    SDLK_HASH		= 35,
+    SDLK_DOLLAR		= 36,
+    SDLK_AMPERSAND		= 38,
+    SDLK_QUOTE		= 39,
+    SDLK_LEFTPAREN		= 40,
+    SDLK_RIGHTPAREN		= 41,
+    SDLK_ASTERISK		= 42,
+    SDLK_PLUS		= 43,
+    SDLK_COMMA		= 44,
+    SDLK_MINUS		= 45,
+    SDLK_PERIOD		= 46,
+    SDLK_SLASH		= 47,
+    SDLK_0			= 48,
+    SDLK_1			= 49,
+    SDLK_2			= 50,
+    SDLK_3			= 51,
+    SDLK_4			= 52,
+    SDLK_5			= 53,
+    SDLK_6			= 54,
+    SDLK_7			= 55,
+    SDLK_8			= 56,
+    SDLK_9			= 57,
+    SDLK_COLON		= 58,
+    SDLK_SEMICOLON		= 59,
+    SDLK_LESS		= 60,
+    SDLK_EQUALS		= 61,
+    SDLK_GREATER		= 62,
+    SDLK_QUESTION		= 63,
+    SDLK_AT			= 64,
+    /*
+       Skip uppercase letters (no keysyms are defined for 65-90)
+     */
+    SDLK_LEFTBRACKET	= 91,
+    SDLK_BACKSLASH		= 92,
+    SDLK_RIGHTBRACKET	= 93,
+    SDLK_CARET		= 94,
+    SDLK_UNDERSCORE		= 95,
+    SDLK_BACKQUOTE		= 96,
+    SDLK_a			= 97,
+    SDLK_b			= 98,
+    SDLK_c			= 99,
+    SDLK_d			= 100,
+    SDLK_e			= 101,
+    SDLK_f			= 102,
+    SDLK_g			= 103,
+    SDLK_h			= 104,
+    SDLK_i			= 105,
+    SDLK_j			= 106,
+    SDLK_k			= 107,
+    SDLK_l			= 108,
+    SDLK_m			= 109,
+    SDLK_n			= 110,
+    SDLK_o			= 111,
+    SDLK_p			= 112,
+    SDLK_q			= 113,
+    SDLK_r			= 114,
+    SDLK_s			= 115,
+    SDLK_t			= 116,
+    SDLK_u			= 117,
+    SDLK_v			= 118,
+    SDLK_w			= 119,
+    SDLK_x			= 120,
+    SDLK_y			= 121,
+    SDLK_z			= 122,
+    SDLK_DELETE		= 127,
+    /* End of ASCII mapped keysyms */
+    /*@}*/
+
+    /** @name International keyboard syms */
+    /*@{*/
+    SDLK_WORLD_0		= 160,		/* 0xA0 */
+    SDLK_WORLD_1		= 161,
+    SDLK_WORLD_2		= 162,
+    SDLK_WORLD_3		= 163,
+    SDLK_WORLD_4		= 164,
+    SDLK_WORLD_5		= 165,
+    SDLK_WORLD_6		= 166,
+    SDLK_WORLD_7		= 167,
+    SDLK_WORLD_8		= 168,
+    SDLK_WORLD_9		= 169,
+    SDLK_WORLD_10		= 170,
+    SDLK_WORLD_11		= 171,
+    SDLK_WORLD_12		= 172,
+    SDLK_WORLD_13		= 173,
+    SDLK_WORLD_14		= 174,
+    SDLK_WORLD_15		= 175,
+    SDLK_WORLD_16		= 176,
+    SDLK_WORLD_17		= 177,
+    SDLK_WORLD_18		= 178,
+    SDLK_WORLD_19		= 179,
+    SDLK_WORLD_20		= 180,
+    SDLK_WORLD_21		= 181,
+    SDLK_WORLD_22		= 182,
+    SDLK_WORLD_23		= 183,
+    SDLK_WORLD_24		= 184,
+    SDLK_WORLD_25		= 185,
+    SDLK_WORLD_26		= 186,
+    SDLK_WORLD_27		= 187,
+    SDLK_WORLD_28		= 188,
+    SDLK_WORLD_29		= 189,
+    SDLK_WORLD_30		= 190,
+    SDLK_WORLD_31		= 191,
+    SDLK_WORLD_32		= 192,
+    SDLK_WORLD_33		= 193,
+    SDLK_WORLD_34		= 194,
+    SDLK_WORLD_35		= 195,
+    SDLK_WORLD_36		= 196,
+    SDLK_WORLD_37		= 197,
+    SDLK_WORLD_38		= 198,
+    SDLK_WORLD_39		= 199,
+    SDLK_WORLD_40		= 200,
+    SDLK_WORLD_41		= 201,
+    SDLK_WORLD_42		= 202,
+    SDLK_WORLD_43		= 203,
+    SDLK_WORLD_44		= 204,
+    SDLK_WORLD_45		= 205,
+    SDLK_WORLD_46		= 206,
+    SDLK_WORLD_47		= 207,
+    SDLK_WORLD_48		= 208,
+    SDLK_WORLD_49		= 209,
+    SDLK_WORLD_50		= 210,
+    SDLK_WORLD_51		= 211,
+    SDLK_WORLD_52		= 212,
+    SDLK_WORLD_53		= 213,
+    SDLK_WORLD_54		= 214,
+    SDLK_WORLD_55		= 215,
+    SDLK_WORLD_56		= 216,
+    SDLK_WORLD_57		= 217,
+    SDLK_WORLD_58		= 218,
+    SDLK_WORLD_59		= 219,
+    SDLK_WORLD_60		= 220,
+    SDLK_WORLD_61		= 221,
+    SDLK_WORLD_62		= 222,
+    SDLK_WORLD_63		= 223,
+    SDLK_WORLD_64		= 224,
+    SDLK_WORLD_65		= 225,
+    SDLK_WORLD_66		= 226,
+    SDLK_WORLD_67		= 227,
+    SDLK_WORLD_68		= 228,
+    SDLK_WORLD_69		= 229,
+    SDLK_WORLD_70		= 230,
+    SDLK_WORLD_71		= 231,
+    SDLK_WORLD_72		= 232,
+    SDLK_WORLD_73		= 233,
+    SDLK_WORLD_74		= 234,
+    SDLK_WORLD_75		= 235,
+    SDLK_WORLD_76		= 236,
+    SDLK_WORLD_77		= 237,
+    SDLK_WORLD_78		= 238,
+    SDLK_WORLD_79		= 239,
+    SDLK_WORLD_80		= 240,
+    SDLK_WORLD_81		= 241,
+    SDLK_WORLD_82		= 242,
+    SDLK_WORLD_83		= 243,
+    SDLK_WORLD_84		= 244,
+    SDLK_WORLD_85		= 245,
+    SDLK_WORLD_86		= 246,
+    SDLK_WORLD_87		= 247,
+    SDLK_WORLD_88		= 248,
+    SDLK_WORLD_89		= 249,
+    SDLK_WORLD_90		= 250,
+    SDLK_WORLD_91		= 251,
+    SDLK_WORLD_92		= 252,
+    SDLK_WORLD_93		= 253,
+    SDLK_WORLD_94		= 254,
+    SDLK_WORLD_95		= 255,		/* 0xFF */
+    /*@}*/
+
+    /** @name Numeric keypad */
+    /*@{*/
+    SDLK_KP0		= 256,
+    SDLK_KP1		= 257,
+    SDLK_KP2		= 258,
+    SDLK_KP3		= 259,
+    SDLK_KP4		= 260,
+    SDLK_KP5		= 261,
+    SDLK_KP6		= 262,
+    SDLK_KP7		= 263,
+    SDLK_KP8		= 264,
+    SDLK_KP9		= 265,
+    SDLK_KP_PERIOD		= 266,
+    SDLK_KP_DIVIDE		= 267,
+    SDLK_KP_MULTIPLY	= 268,
+    SDLK_KP_MINUS		= 269,
+    SDLK_KP_PLUS		= 270,
+    SDLK_KP_ENTER		= 271,
+    SDLK_KP_EQUALS		= 272,
+    /*@}*/
+
+    /** @name Arrows + Home/End pad */
+    /*@{*/
+    SDLK_UP			= 273,
+    SDLK_DOWN		= 274,
+    SDLK_RIGHT		= 275,
+    SDLK_LEFT		= 276,
+    SDLK_INSERT		= 277,
+    SDLK_HOME		= 278,
+    SDLK_END		= 279,
+    SDLK_PAGEUP		= 280,
+    SDLK_PAGEDOWN		= 281,
+    /*@}*/
+
+    /** @name Function keys */
+    /*@{*/
+    SDLK_F1			= 282,
+    SDLK_F2			= 283,
+    SDLK_F3			= 284,
+    SDLK_F4			= 285,
+    SDLK_F5			= 286,
+    SDLK_F6			= 287,
+    SDLK_F7			= 288,
+    SDLK_F8			= 289,
+    SDLK_F9			= 290,
+    SDLK_F10		= 291,
+    SDLK_F11		= 292,
+    SDLK_F12		= 293,
+    SDLK_F13		= 294,
+    SDLK_F14		= 295,
+    SDLK_F15		= 296,
+    /*@}*/
+
+    /** @name Key state modifier keys */
+    /*@{*/
+    SDLK_NUMLOCK		= 300,
+    SDLK_CAPSLOCK		= 301,
+    SDLK_SCROLLOCK		= 302,
+    SDLK_RSHIFT		= 303,
+    SDLK_LSHIFT		= 304,
+    SDLK_RCTRL		= 305,
+    SDLK_LCTRL		= 306,
+    SDLK_RALT		= 307,
+    SDLK_LALT		= 308,
+    SDLK_RMETA		= 309,
+    SDLK_LMETA		= 310,
+    SDLK_LSUPER		= 311,		/**< Left "Windows" key */
+    SDLK_RSUPER		= 312,		/**< Right "Windows" key */
+    SDLK_MODE		= 313,		/**< "Alt Gr" key */
+    SDLK_COMPOSE		= 314,		/**< Multi-key compose key */
+    /*@}*/
+
+    /** @name Miscellaneous function keys */
+    /*@{*/
+    SDLK_HELP		= 315,
+    SDLK_PRINT		= 316,
+    SDLK_SYSREQ		= 317,
+    SDLK_BREAK		= 318,
+    SDLK_MENU		= 319,
+    SDLK_POWER		= 320,		/**< Power Macintosh power key */
+    SDLK_EURO		= 321,		/**< Some european keyboards */
+    SDLK_UNDO		= 322,		/**< Atari keyboard has Undo */
+    /*@}*/
+
+    /* Add any other keys here */
+
+    /* PRIZM-specific keys, numbered 360-409 (the keysyms above end at SDLK_UNDO = 322) */
+
+    SDLK_PRZ_KEY_F1	=	360	,
+    SDLK_PRZ_KEY_F2	=	361	,
+    SDLK_PRZ_KEY_F3	=	362	,
+    SDLK_PRZ_KEY_F4	=	363	,
+    SDLK_PRZ_KEY_F5	=	364	,
+    SDLK_PRZ_KEY_F6	=	365	,
+    SDLK_PRZ_KEY_SHIFT	=	366	,
+    SDLK_PRZ_KEY_OPTN	=	367	,
+    SDLK_PRZ_KEY_VARS	=	368	,
+    SDLK_PRZ_KEY_MENU	=	369	,
+    SDLK_PRZ_KEY_LEFT	=	370	,
+    SDLK_PRZ_KEY_UP	=	371	,
+    SDLK_PRZ_KEY_ALPHA	=	372	,
+    SDLK_PRZ_KEY_SQUARE	=	373	,
+    SDLK_PRZ_KEY_POWER	=	374	,
+    SDLK_PRZ_KEY_EXIT	=	375	,
+    SDLK_PRZ_KEY_DOWN	=	376	,
+    SDLK_PRZ_KEY_RIGHT	=	377	,
+    SDLK_PRZ_KEY_XOT	=	378	,
+    SDLK_PRZ_KEY_LOG	=	379	,
+    SDLK_PRZ_KEY_LN	=	380	,
+    SDLK_PRZ_KEY_SIN	=	381	,
+    SDLK_PRZ_KEY_COS	=	382	,
+    SDLK_PRZ_KEY_TAN	=	383	,
+    SDLK_PRZ_KEY_FRAC	=	384	,
+    SDLK_PRZ_KEY_FD	=	385	,
+    SDLK_PRZ_KEY_LEFTP	=	386	,
+    SDLK_PRZ_KEY_RIGHTP	=	387	,
+    SDLK_PRZ_KEY_COMMA	=	388	,
+    SDLK_PRZ_KEY_ARROW	=	389	,
+    SDLK_PRZ_KEY_7	=	390	,
+    SDLK_PRZ_KEY_8	=	391	,
+    SDLK_PRZ_KEY_9	=	392	,
+    SDLK_PRZ_KEY_DEL	=	393	,
+    SDLK_PRZ_KEY_4	=	394	,
+    SDLK_PRZ_KEY_5	=	395	,
+    SDLK_PRZ_KEY_6	=	396	,
+    SDLK_PRZ_KEY_MUL	=	397	,
+    SDLK_PRZ_KEY_DIV	=	398	,
+    SDLK_PRZ_KEY_1	=	399	,
+    SDLK_PRZ_KEY_2	=	400	,
+    SDLK_PRZ_KEY_3	=	401	,
+    SDLK_PRZ_KEY_ADD	=	402	,
+    SDLK_PRZ_KEY_SUB	=	403	,
+    SDLK_PRZ_KEY_0	=	404	,
+    SDLK_PRZ_KEY_DOT	=	405	,
+    SDLK_PRZ_KEY_EXP	=	406	,
+    SDLK_PRZ_KEY_NEG	=	407	,
+    SDLK_PRZ_KEY_EXE	=	408	,
+    SDLK_PRZ_KEY_ACON	=	409	,
+
+
+    SDLK_LAST		/**< Implicit value 410: one past SDLK_PRZ_KEY_ACON, the highest explicit keysym */
+} SDLKey;
+
+/** Enumeration of valid key mods (single-bit flags, possibly OR'd together) */
+typedef enum
+{
+    KMOD_NONE  = 0x0000,	/**< No modifier bit set */
+    KMOD_LSHIFT= 0x0001,
+    KMOD_RSHIFT= 0x0002,	/* bits 0x0004-0x0020 are not assigned in this enum */
+    KMOD_LCTRL = 0x0040,
+    KMOD_RCTRL = 0x0080,
+    KMOD_LALT  = 0x0100,
+    KMOD_RALT  = 0x0200,
+    KMOD_LMETA = 0x0400,
+    KMOD_RMETA = 0x0800,
+    KMOD_NUM   = 0x1000,
+    KMOD_CAPS  = 0x2000,
+    KMOD_MODE  = 0x4000,
+    KMOD_RESERVED = 0x8000
+} SDLMod;
+
+#define KMOD_CTRL	(KMOD_LCTRL|KMOD_RCTRL)	/* mask of both Ctrl flags: test with (mod & KMOD_CTRL) */
+#define KMOD_SHIFT	(KMOD_LSHIFT|KMOD_RSHIFT)	/* mask of both Shift flags */
+#define KMOD_ALT	(KMOD_LALT|KMOD_RALT)	/* mask of both Alt flags */
+#define KMOD_META	(KMOD_LMETA|KMOD_RMETA)	/* mask of both Meta flags */
+
+#endif /* _SDL_keysym_h */
diff --git a/include/SDL_loadso.h b/include/SDL/SDL_loadso.h
similarity index 100%
rename from include/SDL_loadso.h
rename to include/SDL/SDL_loadso.h
diff --git a/include/SDL_main.h b/include/SDL/SDL_main.h
similarity index 100%
rename from include/SDL_main.h
rename to include/SDL/SDL_main.h
diff --git a/include/SDL_mouse.h b/include/SDL/SDL_mouse.h
similarity index 100%
rename from include/SDL_mouse.h
rename to include/SDL/SDL_mouse.h
diff --git a/include/SDL_mutex.h b/include/SDL/SDL_mutex.h
similarity index 100%
rename from include/SDL_mutex.h
rename to include/SDL/SDL_mutex.h
diff --git a/include/SDL_name.h b/include/SDL/SDL_name.h
similarity index 100%
rename from include/SDL_name.h
rename to include/SDL/SDL_name.h
diff --git a/include/SDL_opengl.h b/include/SDL/SDL_opengl.h
similarity index 100%
rename from include/SDL_opengl.h
rename to include/SDL/SDL_opengl.h
diff --git a/include/SDL_platform.h b/include/SDL/SDL_platform.h
similarity index 100%
rename from include/SDL_platform.h
rename to include/SDL/SDL_platform.h
diff --git a/include/SDL_quit.h b/include/SDL/SDL_quit.h
similarity index 100%
rename from include/SDL_quit.h
rename to include/SDL/SDL_quit.h
diff --git a/include/SDL/SDL_rotozoom.h b/include/SDL/SDL_rotozoom.h
new file mode 100644
index 0000000..80b31f1
--- /dev/null
+++ b/include/SDL/SDL_rotozoom.h
@@ -0,0 +1,123 @@
+/*  
+
+SDL_rotozoom.c: rotozoomer, zoomer and shrinker for 32bit or 8bit surfaces
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifndef _SDL_rotozoom_h
+#define _SDL_rotozoom_h
+
+#include <math.h>
+/* Pulled in before the extern "C" block so SDL's own headers are not nested inside it. */
+#include "SDL.h"
+
+/* Set up for C function definitions, even when using C++ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef M_PI
+#define M_PI	3.141592654
+#endif
+
+	/* ---- Defines */
+
+	/*!
+	\brief Disable anti-aliasing (no smoothing).
+	*/
+#define SMOOTHING_OFF		0
+
+	/*!
+	\brief Enable anti-aliasing (smoothing).
+	*/
+#define SMOOTHING_ON		1
+
+	/* ---- Function Prototypes */
+
+#ifdef _MSC_VER
+#  if defined(DLL_EXPORT) && !defined(LIBSDL_GFX_DLL_IMPORT)
+#    define SDL_ROTOZOOM_SCOPE __declspec(dllexport)
+#  else
+#    ifdef LIBSDL_GFX_DLL_IMPORT
+#      define SDL_ROTOZOOM_SCOPE __declspec(dllimport)
+#    endif
+#  endif
+#endif
+#ifndef SDL_ROTOZOOM_SCOPE
+#  define SDL_ROTOZOOM_SCOPE extern
+#endif
+
+	/*
+	Rotozoom functions: each takes a source surface (or its width/height),
+	a rotation angle and one zoom factor, or separate zoomx/zoomy factors
+	for the XY variants. Angle units are not stated in this header.
+	*/
+	/* `smooth`: presumably SMOOTHING_ON or SMOOTHING_OFF (defined above) -- TODO confirm. */
+	SDL_ROTOZOOM_SCOPE SDL_Surface *rotozoomSurface(SDL_Surface * src, double angle, double zoom, int smooth);
+
+	SDL_ROTOZOOM_SCOPE SDL_Surface *rotozoomSurfaceXY
+		(SDL_Surface * src, double angle, double zoomx, double zoomy, int smooth);
+
+	/* The Size variants return void; presumably the computed size is written to dstwidth/dstheight. */
+	SDL_ROTOZOOM_SCOPE void rotozoomSurfaceSize(int width, int height, double angle, double zoom, int *dstwidth,
+		int *dstheight);
+
+	SDL_ROTOZOOM_SCOPE void rotozoomSurfaceSizeXY
+		(int width, int height, double angle, double zoomx, double zoomy, 
+		int *dstwidth, int *dstheight);
+
+	/*
+	Zooming functions: take independent horizontal (zoomx) and vertical
+	(zoomy) factors and no angle. zoomSurfaceSize takes only dimensions
+	and factors, with int pointer dstwidth/dstheight parameters.
+	*/
+
+	SDL_ROTOZOOM_SCOPE SDL_Surface *zoomSurface(SDL_Surface * src, double zoomx, double zoomy, int smooth);
+
+	SDL_ROTOZOOM_SCOPE void zoomSurfaceSize(int width, int height, double zoomx, double zoomy, int *dstwidth, int *dstheight);
+
+	/*
+	Shrinking functions: shrinkSurface takes integer factors (factorx,
+	factory) rather than the double zoom factors used above, and has no
+	`smooth` parameter. It returns an SDL_Surface pointer.
+	*/
+
+	SDL_ROTOZOOM_SCOPE SDL_Surface *shrinkSurface(SDL_Surface * src, int factorx, int factory);
+
+	/*
+	Specialized rotation functions: rotateSurface90Degrees takes a count of
+	clockwise turns (numClockwiseTurns) instead of an angle; presumably
+	each turn is 90 degrees, as the name suggests -- TODO confirm.
+	*/
+
+	SDL_ROTOZOOM_SCOPE SDL_Surface* rotateSurface90Degrees(SDL_Surface* src, int numClockwiseTurns);
+
+	/* Ends C function definitions when using C++ */
+#ifdef __cplusplus
+}
+#endif
+
+#endif				/* _SDL_rotozoom_h */
diff --git a/include/SDL_rwops.h b/include/SDL/SDL_rwops.h
similarity index 100%
rename from include/SDL_rwops.h
rename to include/SDL/SDL_rwops.h
diff --git a/include/SDL_stdinc.h b/include/SDL/SDL_stdinc.h
similarity index 100%
rename from include/SDL_stdinc.h
rename to include/SDL/SDL_stdinc.h
diff --git a/include/SDL_syswm.h b/include/SDL/SDL_syswm.h
similarity index 100%
rename from include/SDL_syswm.h
rename to include/SDL/SDL_syswm.h
diff --git a/include/SDL_thread.h b/include/SDL/SDL_thread.h
similarity index 100%
rename from include/SDL_thread.h
rename to include/SDL/SDL_thread.h
diff --git a/include/SDL_timer.h b/include/SDL/SDL_timer.h
similarity index 100%
rename from include/SDL_timer.h
rename to include/SDL/SDL_timer.h
diff --git a/include/SDL_types.h b/include/SDL/SDL_types.h
similarity index 100%
rename from include/SDL_types.h
rename to include/SDL/SDL_types.h
diff --git a/include/SDL_version.h b/include/SDL/SDL_version.h
similarity index 100%
rename from include/SDL_version.h
rename to include/SDL/SDL_version.h
diff --git a/include/SDL_video.h b/include/SDL/SDL_video.h
similarity index 97%
rename from include/SDL_video.h
rename to include/SDL/SDL_video.h
index e94aeec..a533391 100644
--- a/include/SDL_video.h
+++ b/include/SDL/SDL_video.h
@@ -260,30 +260,30 @@ typedef enum {
 #define PRZ_FONT_WIDTH  8
 #define PRZ_FONT_HEIGHT 8
 
-typedef struct nSDL_Font {
+typedef struct cSDL_Font {
 	SDL_Surface *chars[PRZ_FONT_NUMCHARS];
 	Uint8 char_width[PRZ_FONT_NUMCHARS];
 	int hspacing, vspacing;
 	SDL_bool monospaced;
-} nSDL_Font;
+} cSDL_Font;
 
-nSDL_Font *nSDL_LoadFont(int font_index, Uint8 r, Uint8 g, Uint8 b);
-void nSDL_SetFontSpacing(nSDL_Font *font, int hspacing, int vspacing);
-void nSDL_EnableFontMonospaced(nSDL_Font *font, SDL_bool toggle);
-void nSDL_FreeFont(nSDL_Font *font);
-int nSDL_DrawString(SDL_Surface *surface, nSDL_Font *font,
+cSDL_Font *cSDL_LoadFont(int font_index, Uint8 r, Uint8 g, Uint8 b);
+void cSDL_SetFontSpacing(cSDL_Font *font, int hspacing, int vspacing);
+void cSDL_EnableFontMonospaced(cSDL_Font *font, SDL_bool toggle);
+void cSDL_FreeFont(cSDL_Font *font);
+int cSDL_DrawString(SDL_Surface *surface, cSDL_Font *font,
 		    int x, int y, const char *format, ...);
-int nSDL_GetStringWidth(nSDL_Font *font, const char *s);
-int nSDL_GetStringHeight(nSDL_Font *font, const char *s);
-SDL_Surface *nSDL_LoadImage(Uint16 *data);
-int nSDL_EnableRelativePaths(char **argv);
+int cSDL_GetStringWidth(cSDL_Font *font, const char *s);
+int cSDL_GetStringHeight(cSDL_Font *font, const char *s);
+SDL_Surface *cSDL_LoadImage(Uint16 *data);
+int cSDL_EnableRelativePaths(char **argv);
 
 #define PRZ_PIXEL_ADDR(origin, x, y, pitch, bpp) ((Uint8 *)origin + ((x) * (bpp)) + ((y) * (pitch)))
 
 #define PXL(bpp) PRZ_PIXEL_ADDR(surface->pixels, x, y, surface->pitch, bpp)
 
 static __inline__ __attribute__((always_inline))
-Uint32 nSDL_GetPixel(SDL_Surface *surface, int x, int y)
+Uint32 cSDL_GetPixel(SDL_Surface *surface, int x, int y)
 {
 	switch ( surface->format->BytesPerPixel ) {
 		case 2: return(*(Uint16 *)PXL(2));
@@ -295,7 +295,7 @@ Uint32 nSDL_GetPixel(SDL_Surface *surface, int x, int y)
 }
 
 static __inline__ __attribute__((always_inline))
-void nSDL_SetPixel(SDL_Surface *surface, int x, int y, Uint32 color)
+void cSDL_SetPixel(SDL_Surface *surface, int x, int y, Uint32 color)
 {
 	switch ( surface->format->BytesPerPixel ) {
 		case 2: *(Uint16 *)PXL(2) = (Uint16)color; return;
@@ -645,7 +645,9 @@ extern DECLSPEC void SDLCALL SDL_UnlockSurface(SDL_Surface *surface);
 extern DECLSPEC SDL_Surface * SDLCALL SDL_LoadBMP_RW(SDL_RWops *src, int freesrc);
 
 /** Convenience macro -- load a surface from a file */
-#define SDL_LoadBMP(file)	SDL_LoadBMP_RW(SDL_RWFromFile(file, "rb"), 1)
+/* No longer the macro SDL_LoadBMP_RW(SDL_RWFromFile(file, "rb"), 1): SDL_LoadBMP is declared as a function below. */
+
+extern DECLSPEC SDL_Surface * SDLCALL SDL_LoadBMP(const char *filename );
 
 /**
  * Save a surface to a seekable SDL data source (memory or file.)
diff --git a/include/begin_code.h b/include/SDL/begin_code.h
similarity index 100%
rename from include/begin_code.h
rename to include/SDL/begin_code.h
diff --git a/include/close_code.h b/include/SDL/close_code.h
similarity index 100%
rename from include/close_code.h
rename to include/SDL/close_code.h
diff --git a/src/loadso/dummy/SDL_sysloadso.c b/include/SDL_config.h.in
similarity index 53%
rename from src/loadso/dummy/SDL_sysloadso.c
rename to include/SDL_config.h.in
index f8c9af9..0d9a007 100644
--- a/src/loadso/dummy/SDL_sysloadso.c
+++ b/include/SDL_config.h.in
@@ -19,32 +19,36 @@
     Sam Lantinga
     slouken@libsdl.org
 */
-#include "SDL_config.h"
 
-#if defined(SDL_LOADSO_DUMMY) || defined(SDL_LOADSO_DISABLED)
+#ifndef _SDL_config_h
+#define _SDL_config_h
 
-/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-/* System dependent library loading routines                           */
+#define FXCG50 1
 
-#include "SDL_loadso.h"
+#include "SDL_platform.h"
 
-void *SDL_LoadObject(const char *sofile)
-{
-	const char *loaderror = "SDL_LoadObject() not implemented";
-	SDL_SetError("Failed loading %s: %s", sofile, loaderror);
-	return(NULL);
-}
 
-void *SDL_LoadFunction(void *handle, const char *name)
-{
-	const char *loaderror = "SDL_LoadFunction() not implemented";
-	SDL_SetError("Failed loading %s: %s", name, loaderror);
-	return(NULL);
-}
 
-void SDL_UnloadObject(void *handle)
-{
-    /* no-op. */
-}
+/* Add any platform that doesn't build using the configure system */
+#if defined(__DREAMCAST__)
+#include "SDL_config_dreamcast.h"
+#elif defined(__MACOS__)
+#include "SDL_config_macos.h"
+#elif defined(__MACOSX__)
+#include "SDL_config_macosx.h"
+#elif defined(__SYMBIAN32__)
+#include "SDL_config_symbian.h"  /* must be before win32! */
+#elif defined(__WIN32__)
+#include "SDL_config_win32.h"
+#elif defined(__OS2__)
+#include "SDL_config_os2.h"
 
-#endif /* SDL_LOADSO_DUMMY || SDL_LOADSO_DISABLED */
+/* Config file for Casio fx-CG10/20/50 (PRIZM / Graph 90+E) */
+#elif defined(__CASIOPRIZM__)
+#include "SDL_config_prizm.h"
+
+#else
+#include "SDL_config_minimal.h"
+#endif /* platform config */
+
+#endif /* _SDL_config_h */
diff --git a/include/SDL_keysym.h b/include/SDL_keysym.h
deleted file mode 100644
index f2ad12b..0000000
--- a/include/SDL_keysym.h
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-
-#ifndef _SDL_keysym_h
-#define _SDL_keysym_h
-
-/** What we really want is a mapping of every raw key on the keyboard.
- *  To support international keyboards, we use the range 0xA1 - 0xFF
- *  as international virtual keycodes.  We'll follow in the footsteps of X11...
- *  @brief The names of the keys
- */
-typedef enum {
-        /** @name ASCII mapped keysyms
-         *  The keyboard syms have been cleverly chosen to map to ASCII
-         */
-        /*@{*/
-	SDLK_UNKNOWN		= 0,
-	SDLK_FIRST		= 0,
-	SDLK_BACKSPACE		= 8,
-	SDLK_TAB		= 9,
-	SDLK_CLEAR		= 12,
-	SDLK_RETURN		= 13,
-	SDLK_PAUSE		= 19,
-	SDLK_ESCAPE		= 27,
-	SDLK_SPACE		= 32,
-	SDLK_EXCLAIM		= 33,
-	SDLK_QUOTEDBL		= 34,
-	SDLK_HASH		= 35,
-	SDLK_DOLLAR		= 36,
-	SDLK_AMPERSAND		= 38,
-	SDLK_QUOTE		= 39,
-	SDLK_LEFTPAREN		= 40,
-	SDLK_RIGHTPAREN		= 41,
-	SDLK_ASTERISK		= 42,
-	SDLK_PLUS		= 43,
-	SDLK_COMMA		= 44,
-	SDLK_MINUS		= 45,
-	SDLK_PERIOD		= 46,
-	SDLK_SLASH		= 47,
-	SDLK_0			= 48,
-	SDLK_1			= 49,
-	SDLK_2			= 50,
-	SDLK_3			= 51,
-	SDLK_4			= 52,
-	SDLK_5			= 53,
-	SDLK_6			= 54,
-	SDLK_7			= 55,
-	SDLK_8			= 56,
-	SDLK_9			= 57,
-	SDLK_COLON		= 58,
-	SDLK_SEMICOLON		= 59,
-	SDLK_LESS		= 60,
-	SDLK_EQUALS		= 61,
-	SDLK_GREATER		= 62,
-	SDLK_QUESTION		= 63,
-	SDLK_AT			= 64,
-	/* 
-	   Skip uppercase letters
-	 */
-	SDLK_LEFTBRACKET	= 91,
-	SDLK_BACKSLASH		= 92,
-	SDLK_RIGHTBRACKET	= 93,
-	SDLK_CARET		= 94,
-	SDLK_UNDERSCORE		= 95,
-	SDLK_BACKQUOTE		= 96,
-	SDLK_a			= 97,
-	SDLK_b			= 98,
-	SDLK_c			= 99,
-	SDLK_d			= 100,
-	SDLK_e			= 101,
-	SDLK_f			= 102,
-	SDLK_g			= 103,
-	SDLK_h			= 104,
-	SDLK_i			= 105,
-	SDLK_j			= 106,
-	SDLK_k			= 107,
-	SDLK_l			= 108,
-	SDLK_m			= 109,
-	SDLK_n			= 110,
-	SDLK_o			= 111,
-	SDLK_p			= 112,
-	SDLK_q			= 113,
-	SDLK_r			= 114,
-	SDLK_s			= 115,
-	SDLK_t			= 116,
-	SDLK_u			= 117,
-	SDLK_v			= 118,
-	SDLK_w			= 119,
-	SDLK_x			= 120,
-	SDLK_y			= 121,
-	SDLK_z			= 122,
-	SDLK_DELETE		= 127,
-	/* End of ASCII mapped keysyms */
-        /*@}*/
-
-	/** @name International keyboard syms */
-        /*@{*/
-	SDLK_WORLD_0		= 160,		/* 0xA0 */
-	SDLK_WORLD_1		= 161,
-	SDLK_WORLD_2		= 162,
-	SDLK_WORLD_3		= 163,
-	SDLK_WORLD_4		= 164,
-	SDLK_WORLD_5		= 165,
-	SDLK_WORLD_6		= 166,
-	SDLK_WORLD_7		= 167,
-	SDLK_WORLD_8		= 168,
-	SDLK_WORLD_9		= 169,
-	SDLK_WORLD_10		= 170,
-	SDLK_WORLD_11		= 171,
-	SDLK_WORLD_12		= 172,
-	SDLK_WORLD_13		= 173,
-	SDLK_WORLD_14		= 174,
-	SDLK_WORLD_15		= 175,
-	SDLK_WORLD_16		= 176,
-	SDLK_WORLD_17		= 177,
-	SDLK_WORLD_18		= 178,
-	SDLK_WORLD_19		= 179,
-	SDLK_WORLD_20		= 180,
-	SDLK_WORLD_21		= 181,
-	SDLK_WORLD_22		= 182,
-	SDLK_WORLD_23		= 183,
-	SDLK_WORLD_24		= 184,
-	SDLK_WORLD_25		= 185,
-	SDLK_WORLD_26		= 186,
-	SDLK_WORLD_27		= 187,
-	SDLK_WORLD_28		= 188,
-	SDLK_WORLD_29		= 189,
-	SDLK_WORLD_30		= 190,
-	SDLK_WORLD_31		= 191,
-	SDLK_WORLD_32		= 192,
-	SDLK_WORLD_33		= 193,
-	SDLK_WORLD_34		= 194,
-	SDLK_WORLD_35		= 195,
-	SDLK_WORLD_36		= 196,
-	SDLK_WORLD_37		= 197,
-	SDLK_WORLD_38		= 198,
-	SDLK_WORLD_39		= 199,
-	SDLK_WORLD_40		= 200,
-	SDLK_WORLD_41		= 201,
-	SDLK_WORLD_42		= 202,
-	SDLK_WORLD_43		= 203,
-	SDLK_WORLD_44		= 204,
-	SDLK_WORLD_45		= 205,
-	SDLK_WORLD_46		= 206,
-	SDLK_WORLD_47		= 207,
-	SDLK_WORLD_48		= 208,
-	SDLK_WORLD_49		= 209,
-	SDLK_WORLD_50		= 210,
-	SDLK_WORLD_51		= 211,
-	SDLK_WORLD_52		= 212,
-	SDLK_WORLD_53		= 213,
-	SDLK_WORLD_54		= 214,
-	SDLK_WORLD_55		= 215,
-	SDLK_WORLD_56		= 216,
-	SDLK_WORLD_57		= 217,
-	SDLK_WORLD_58		= 218,
-	SDLK_WORLD_59		= 219,
-	SDLK_WORLD_60		= 220,
-	SDLK_WORLD_61		= 221,
-	SDLK_WORLD_62		= 222,
-	SDLK_WORLD_63		= 223,
-	SDLK_WORLD_64		= 224,
-	SDLK_WORLD_65		= 225,
-	SDLK_WORLD_66		= 226,
-	SDLK_WORLD_67		= 227,
-	SDLK_WORLD_68		= 228,
-	SDLK_WORLD_69		= 229,
-	SDLK_WORLD_70		= 230,
-	SDLK_WORLD_71		= 231,
-	SDLK_WORLD_72		= 232,
-	SDLK_WORLD_73		= 233,
-	SDLK_WORLD_74		= 234,
-	SDLK_WORLD_75		= 235,
-	SDLK_WORLD_76		= 236,
-	SDLK_WORLD_77		= 237,
-	SDLK_WORLD_78		= 238,
-	SDLK_WORLD_79		= 239,
-	SDLK_WORLD_80		= 240,
-	SDLK_WORLD_81		= 241,
-	SDLK_WORLD_82		= 242,
-	SDLK_WORLD_83		= 243,
-	SDLK_WORLD_84		= 244,
-	SDLK_WORLD_85		= 245,
-	SDLK_WORLD_86		= 246,
-	SDLK_WORLD_87		= 247,
-	SDLK_WORLD_88		= 248,
-	SDLK_WORLD_89		= 249,
-	SDLK_WORLD_90		= 250,
-	SDLK_WORLD_91		= 251,
-	SDLK_WORLD_92		= 252,
-	SDLK_WORLD_93		= 253,
-	SDLK_WORLD_94		= 254,
-	SDLK_WORLD_95		= 255,		/* 0xFF */
-        /*@}*/
-
-	/** @name Numeric keypad */
-        /*@{*/
-	SDLK_KP0		= 256,
-	SDLK_KP1		= 257,
-	SDLK_KP2		= 258,
-	SDLK_KP3		= 259,
-	SDLK_KP4		= 260,
-	SDLK_KP5		= 261,
-	SDLK_KP6		= 262,
-	SDLK_KP7		= 263,
-	SDLK_KP8		= 264,
-	SDLK_KP9		= 265,
-	SDLK_KP_PERIOD		= 266,
-	SDLK_KP_DIVIDE		= 267,
-	SDLK_KP_MULTIPLY	= 268,
-	SDLK_KP_MINUS		= 269,
-	SDLK_KP_PLUS		= 270,
-	SDLK_KP_ENTER		= 271,
-	SDLK_KP_EQUALS		= 272,
-        /*@}*/
-
-	/** @name Arrows + Home/End pad */
-        /*@{*/
-	SDLK_UP			= 273,
-	SDLK_DOWN		= 274,
-	SDLK_RIGHT		= 275,
-	SDLK_LEFT		= 276,
-	SDLK_INSERT		= 277,
-	SDLK_HOME		= 278,
-	SDLK_END		= 279,
-	SDLK_PAGEUP		= 280,
-	SDLK_PAGEDOWN		= 281,
-        /*@}*/
-
-	/** @name Function keys */
-        /*@{*/
-	SDLK_F1			= 282,
-	SDLK_F2			= 283,
-	SDLK_F3			= 284,
-	SDLK_F4			= 285,
-	SDLK_F5			= 286,
-	SDLK_F6			= 287,
-	SDLK_F7			= 288,
-	SDLK_F8			= 289,
-	SDLK_F9			= 290,
-	SDLK_F10		= 291,
-	SDLK_F11		= 292,
-	SDLK_F12		= 293,
-	SDLK_F13		= 294,
-	SDLK_F14		= 295,
-	SDLK_F15		= 296,
-        /*@}*/
-
-	/** @name Key state modifier keys */
-        /*@{*/
-	SDLK_NUMLOCK		= 300,
-	SDLK_CAPSLOCK		= 301,
-	SDLK_SCROLLOCK		= 302,
-	SDLK_RSHIFT		= 303,
-	SDLK_LSHIFT		= 304,
-	SDLK_RCTRL		= 305,
-	SDLK_LCTRL		= 306,
-	SDLK_RALT		= 307,
-	SDLK_LALT		= 308,
-	SDLK_RMETA		= 309,
-	SDLK_LMETA		= 310,
-	SDLK_LSUPER		= 311,		/**< Left "Windows" key */
-	SDLK_RSUPER		= 312,		/**< Right "Windows" key */
-	SDLK_MODE		= 313,		/**< "Alt Gr" key */
-	SDLK_COMPOSE		= 314,		/**< Multi-key compose key */
-        /*@}*/
-
-	/** @name Miscellaneous function keys */
-        /*@{*/
-	SDLK_HELP		= 315,
-	SDLK_PRINT		= 316,
-	SDLK_SYSREQ		= 317,
-	SDLK_BREAK		= 318,
-	SDLK_MENU		= 319,
-	SDLK_POWER		= 320,		/**< Power Macintosh power key */
-	SDLK_EURO		= 321,		/**< Some european keyboards */
-	SDLK_UNDO		= 322,		/**< Atari keyboard has Undo */
-        /*@}*/
-
-	/* Add any other keys here */
-
-	SDLK_LAST
-} SDLKey;
-
-/** Enumeration of valid key mods (possibly OR'd together) */
-typedef enum {
-	KMOD_NONE  = 0x0000,
-	KMOD_LSHIFT= 0x0001,
-	KMOD_RSHIFT= 0x0002,
-	KMOD_LCTRL = 0x0040,
-	KMOD_RCTRL = 0x0080,
-	KMOD_LALT  = 0x0100,
-	KMOD_RALT  = 0x0200,
-	KMOD_LMETA = 0x0400,
-	KMOD_RMETA = 0x0800,
-	KMOD_NUM   = 0x1000,
-	KMOD_CAPS  = 0x2000,
-	KMOD_MODE  = 0x4000,
-	KMOD_RESERVED = 0x8000
-} SDLMod;
-
-#define KMOD_CTRL	(KMOD_LCTRL|KMOD_RCTRL)
-#define KMOD_SHIFT	(KMOD_LSHIFT|KMOD_RSHIFT)
-#define KMOD_ALT	(KMOD_LALT|KMOD_RALT)
-#define KMOD_META	(KMOD_LMETA|KMOD_RMETA)
-
-#endif /* _SDL_keysym_h */
diff --git a/src/audio/SDL_audiodev.c b/src/audio/SDL_audiodev.c
deleted file mode 100644
index 396156c..0000000
--- a/src/audio/SDL_audiodev.c
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/* Get the name of the audio device we use for output */
-
-#if SDL_AUDIO_DRIVER_BSD || SDL_AUDIO_DRIVER_OSS || SDL_AUDIO_DRIVER_SUNAUDIO
-
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "SDL_stdinc.h"
-#include "SDL_audiodev_c.h"
-
-#ifndef _PATH_DEV_DSP
-#if defined(__NETBSD__) || defined(__OPENBSD__)
-#define _PATH_DEV_DSP  "/dev/audio"
-#else
-#define _PATH_DEV_DSP  "/dev/dsp"
-#endif
-#endif
-#ifndef _PATH_DEV_DSP24
-#define _PATH_DEV_DSP24	"/dev/sound/dsp"
-#endif
-#ifndef _PATH_DEV_AUDIO
-#define _PATH_DEV_AUDIO	"/dev/audio"
-#endif
-
-
-int SDL_OpenAudioPath(char *path, int maxlen, int flags, int classic)
-{
-	const char *audiodev;
-	int audio_fd;
-	char audiopath[1024];
-
-	/* Figure out what our audio device is */
-	if ( ((audiodev=SDL_getenv("SDL_PATH_DSP")) == NULL) &&
-	     ((audiodev=SDL_getenv("AUDIODEV")) == NULL) ) {
-		if ( classic ) {
-			audiodev = _PATH_DEV_AUDIO;
-		} else {
-			struct stat sb;
-
-			/* Added support for /dev/sound/\* in Linux 2.4 */
-			if ( ((stat("/dev/sound", &sb) == 0) && S_ISDIR(sb.st_mode)) &&
-				 ((stat(_PATH_DEV_DSP24, &sb) == 0) && S_ISCHR(sb.st_mode)) ) {
-				audiodev = _PATH_DEV_DSP24;
-			} else {
-				audiodev = _PATH_DEV_DSP;
-			}
-		}
-	}
-	audio_fd = open(audiodev, flags, 0);
-
-	/* If the first open fails, look for other devices */
-	if ( (audio_fd < 0) && (SDL_strlen(audiodev) < (sizeof(audiopath)-3)) ) {
-		int exists, instance;
-		struct stat sb;
-
-		instance = 1;
-		do { /* Don't use errno ENOENT - it may not be thread-safe */
-			SDL_snprintf(audiopath, SDL_arraysize(audiopath),
-			             "%s%d", audiodev, instance++);
-			exists = 0;
-			if ( stat(audiopath, &sb) == 0 ) {
-				exists = 1;
-				audio_fd = open(audiopath, flags, 0); 
-			}
-		} while ( exists && (audio_fd < 0) );
-		audiodev = audiopath;
-	}
-	if ( path != NULL ) {
-		SDL_strlcpy(path, audiodev, maxlen);
-		path[maxlen-1] = '\0';
-	}
-	return(audio_fd);
-}
-
-#elif SDL_AUDIO_DRIVER_PAUD
-
-/* Get the name of the audio device we use for output */
-
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "SDL_stdinc.h"
-#include "SDL_audiodev_c.h"
-
-#ifndef _PATH_DEV_DSP
-#define _PATH_DEV_DSP	"/dev/%caud%c/%c"
-#endif
-
-char devsettings[][3] =
-{
-    { 'p', '0', '1' }, { 'p', '0', '2' }, { 'p', '0', '3' }, { 'p', '0', '4' },
-    { 'p', '1', '1' }, { 'p', '1', '2' }, { 'p', '1', '3' }, { 'p', '1', '4' },
-    { 'p', '2', '1' }, { 'p', '2', '2' }, { 'p', '2', '3' }, { 'p', '2', '4' },
-    { 'p', '3', '1' }, { 'p', '3', '2' }, { 'p', '3', '3' }, { 'p', '3', '4' },
-    { 'b', '0', '1' }, { 'b', '0', '2' }, { 'b', '0', '3' }, { 'b', '0', '4' },
-    { 'b', '1', '1' }, { 'b', '1', '2' }, { 'b', '1', '3' }, { 'b', '1', '4' },
-    { 'b', '2', '1' }, { 'b', '2', '2' }, { 'b', '2', '3' }, { 'b', '2', '4' },
-    { 'b', '3', '1' }, { 'b', '3', '2' }, { 'b', '3', '3' }, { 'b', '3', '4' },
-    { '\0', '\0', '\0' }
-};
-
-static int OpenUserDefinedDevice(char *path, int maxlen, int flags)
-{
-	const char *audiodev;
-	int  audio_fd;
-
-	/* Figure out what our audio device is */
-	if ((audiodev=SDL_getenv("SDL_PATH_DSP")) == NULL) {
-	    audiodev=SDL_getenv("AUDIODEV");
-	}
-	if ( audiodev == NULL ) {
-	    return -1;
-	}
-	audio_fd = open(audiodev, flags, 0);
-	if ( path != NULL ) {
-		SDL_strlcpy(path, audiodev, maxlen);
-		path[maxlen-1] = '\0';
-	}
-	return audio_fd;
-}
-
-int SDL_OpenAudioPath(char *path, int maxlen, int flags, int classic)
-{
-    struct stat sb;
-    int         audio_fd;
-    char        audiopath[1024];
-    int         cycle;
-
-    audio_fd = OpenUserDefinedDevice(path,maxlen,flags);
-    if ( audio_fd != -1 ) {
-        return audio_fd;
-    }
-
-    cycle    = 0;
-    while( devsettings[cycle][0] != '\0' ) {
-        SDL_snprintf( audiopath, SDL_arraysize(audiopath),
-                 _PATH_DEV_DSP,
-                 devsettings[cycle][0],
-                 devsettings[cycle][1],
-                 devsettings[cycle][2]);
-
-	if ( stat(audiopath, &sb) == 0 ) {
-	    audio_fd = open(audiopath, flags, 0);
-	    if ( audio_fd > 0 ) {
-		if ( path != NULL ) {
-		    SDL_strlcpy( path, audiopath, maxlen );
-		}
-	        return audio_fd;
-	    }
-	}
-    }
-    return -1;
-}
-
-#endif /* Audio driver selection */
diff --git a/src/audio/SDL_audiodev.o b/src/audio/SDL_audiodev.o
deleted file mode 100644
index 0d30593..0000000
Binary files a/src/audio/SDL_audiodev.o and /dev/null differ
diff --git a/src/audio/SDL_audiodev_c.h b/src/audio/SDL_audiodev_c.h
deleted file mode 100644
index 179d0d0..0000000
--- a/src/audio/SDL_audiodev_c.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/* Open the audio device, storing the pathname in 'path'  */
-extern int SDL_OpenAudioPath(char *path, int maxlen, int flags, int classic);
-
diff --git a/src/audio/SDL_mixer.c b/src/audio/SDL_mixer.c
deleted file mode 100644
index b5d4d8b..0000000
--- a/src/audio/SDL_mixer.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/* This provides the default mixing callback for the SDL audio routines */
-
-#include "SDL_cpuinfo.h"
-#include "SDL_timer.h"
-#include "SDL_audio.h"
-#include "SDL_sysaudio.h"
-#include "SDL_mixer_MMX.h"
-#include "SDL_mixer_MMX_VC.h"
-#include "SDL_mixer_m68k.h"
-
-/* This table is used to add two sound values together and pin
- * the value to avoid overflow.  (used with permission from ARDI)
- * Changed to use 0xFE instead of 0xFF for better sound quality.
- */
-static const Uint8 mix8[] =
-{
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03,
-  0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
-  0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
-  0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24,
-  0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
-  0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A,
-  0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45,
-  0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50,
-  0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B,
-  0x5C, 0x5D, 0x5E, 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66,
-  0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
-  0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C,
-  0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-  0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x91, 0x92,
-  0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D,
-  0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8,
-  0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
-  0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE,
-  0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9,
-  0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4,
-  0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
-  0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA,
-  0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5,
-  0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE,
-  0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE
-};
-
-/* The volume ranges from 0 - 128 */
-#define ADJUST_VOLUME(s, v)	(s = (s*v)/SDL_MIX_MAXVOLUME)
-#define ADJUST_VOLUME_U8(s, v)	(s = (((s-128)*v)/SDL_MIX_MAXVOLUME)+128)
-
-void SDL_MixAudio (Uint8 *dst, const Uint8 *src, Uint32 len, int volume)
-{
-	Uint16 format;
-
-	if ( volume == 0 ) {
-		return;
-	}
-	/* Mix the user-level audio format */
-	if ( current_audio ) {
-		if ( current_audio->convert.needed ) {
-			format = current_audio->convert.src_format;
-		} else {
-			format = current_audio->spec.format;
-		}
-	} else {
-  		/* HACK HACK HACK */
-		format = AUDIO_S16;
-	}
-	switch (format) {
-
-		case AUDIO_U8: {
-#if defined(__GNUC__) && (defined(__m68k__) && !defined(__mcoldfire__)) && defined(SDL_ASSEMBLY_ROUTINES)
-			SDL_MixAudio_m68k_U8((char*)dst,(char*)src,(unsigned long)len,(long)volume,(char *)mix8);
-#else
-			Uint8 src_sample;
-
-			while ( len-- ) {
-				src_sample = *src;
-				ADJUST_VOLUME_U8(src_sample, volume);
-				*dst = mix8[*dst+src_sample];
-				++dst;
-				++src;
-			}
-#endif
-		}
-		break;
-
-		case AUDIO_S8: {
-#if defined(SDL_BUGGY_MMX_MIXERS) /* buggy, so we're disabling them. --ryan. */
-#if defined(__GNUC__) && defined(__i386__) && defined(SDL_ASSEMBLY_ROUTINES)
-			if (SDL_HasMMX())
-			{
-				SDL_MixAudio_MMX_S8((char*)dst,(char*)src,(unsigned int)len,(int)volume);
-			}
-			else
-#elif ((defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)) && defined(SDL_ASSEMBLY_ROUTINES)
-			if (SDL_HasMMX())
-			{
-				SDL_MixAudio_MMX_S8_VC((char*)dst,(char*)src,(unsigned int)len,(int)volume);
-			}
-			else
-#endif
-#endif
-
-#if defined(__GNUC__) && (defined(__m68k__) && !defined(__mcoldfire__)) && defined(SDL_ASSEMBLY_ROUTINES)
-			SDL_MixAudio_m68k_S8((char*)dst,(char*)src,(unsigned long)len,(long)volume);
-#else
-			{
-			Sint8 *dst8, *src8;
-			Sint8 src_sample;
-			int dst_sample;
-			const int max_audioval = ((1<<(8-1))-1);
-			const int min_audioval = -(1<<(8-1));
-
-			src8 = (Sint8 *)src;
-			dst8 = (Sint8 *)dst;
-			while ( len-- ) {
-				src_sample = *src8;
-				ADJUST_VOLUME(src_sample, volume);
-				dst_sample = *dst8 + src_sample;
-				if ( dst_sample > max_audioval ) {
-					*dst8 = max_audioval;
-				} else
-				if ( dst_sample < min_audioval ) {
-					*dst8 = min_audioval;
-				} else {
-					*dst8 = dst_sample;
-				}
-				++dst8;
-				++src8;
-			}
-			}
-#endif
-		}
-		break;
-
-		case AUDIO_S16LSB: {
-#if defined(SDL_BUGGY_MMX_MIXERS) /* buggy, so we're disabling them. --ryan. */
-#if defined(__GNUC__) && defined(__i386__) && defined(SDL_ASSEMBLY_ROUTINES)
-			if (SDL_HasMMX())
-			{
-				SDL_MixAudio_MMX_S16((char*)dst,(char*)src,(unsigned int)len,(int)volume);
-			}
-                        else
-#elif ((defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)) && defined(SDL_ASSEMBLY_ROUTINES)
-			if (SDL_HasMMX())
-			{
-				SDL_MixAudio_MMX_S16_VC((char*)dst,(char*)src,(unsigned int)len,(int)volume);
-			}
-			else
-#endif
-#endif
-
-#if defined(__GNUC__) && (defined(__m68k__) && !defined(__mcoldfire__)) && defined(SDL_ASSEMBLY_ROUTINES)
-			SDL_MixAudio_m68k_S16LSB((short*)dst,(short*)src,(unsigned long)len,(long)volume);
-#else
-			{
-			Sint16 src1, src2;
-			int dst_sample;
-			const int max_audioval = ((1<<(16-1))-1);
-			const int min_audioval = -(1<<(16-1));
-
-			len /= 2;
-			while ( len-- ) {
-				src1 = ((src[1])<<8|src[0]);
-				ADJUST_VOLUME(src1, volume);
-				src2 = ((dst[1])<<8|dst[0]);
-				src += 2;
-				dst_sample = src1+src2;
-				if ( dst_sample > max_audioval ) {
-					dst_sample = max_audioval;
-				} else
-				if ( dst_sample < min_audioval ) {
-					dst_sample = min_audioval;
-				}
-				dst[0] = dst_sample&0xFF;
-				dst_sample >>= 8;
-				dst[1] = dst_sample&0xFF;
-				dst += 2;
-			}
-			}
-#endif
-		}
-		break;
-
-		case AUDIO_S16MSB: {
-#if defined(__GNUC__) && (defined(__m68k__) && !defined(__mcoldfire__)) && defined(SDL_ASSEMBLY_ROUTINES)
-			SDL_MixAudio_m68k_S16MSB((short*)dst,(short*)src,(unsigned long)len,(long)volume);
-#else
-			Sint16 src1, src2;
-			int dst_sample;
-			const int max_audioval = ((1<<(16-1))-1);
-			const int min_audioval = -(1<<(16-1));
-
-			len /= 2;
-			while ( len-- ) {
-				src1 = ((src[0])<<8|src[1]);
-				ADJUST_VOLUME(src1, volume);
-				src2 = ((dst[0])<<8|dst[1]);
-				src += 2;
-				dst_sample = src1+src2;
-				if ( dst_sample > max_audioval ) {
-					dst_sample = max_audioval;
-				} else
-				if ( dst_sample < min_audioval ) {
-					dst_sample = min_audioval;
-				}
-				dst[1] = dst_sample&0xFF;
-				dst_sample >>= 8;
-				dst[0] = dst_sample&0xFF;
-				dst += 2;
-			}
-#endif
-		}
-		break;
-
-		default: /* If this happens... FIXME! */
-			SDL_SetError("SDL_MixAudio(): unknown audio format");
-			return;
-	}
-}
-
diff --git a/src/audio/SDL_mixer.o b/src/audio/SDL_mixer.o
deleted file mode 100644
index 98939e4..0000000
Binary files a/src/audio/SDL_mixer.o and /dev/null differ
diff --git a/src/audio/SDL_mixer_MMX.c b/src/audio/SDL_mixer_MMX.c
deleted file mode 100644
index a2f1d8d..0000000
--- a/src/audio/SDL_mixer_MMX.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/*
-    MMX assembler version of SDL_MixAudio for signed little endian 16 bit samples and signed 8 bit samples
-    Copyright 2002 Stephane Marchesin (stephane.marchesin@wanadoo.fr)
-    This code is licensed under the LGPL (see COPYING for details)
- 
-    Assumes buffer size in bytes is a multiple of 16
-    Assumes SDL_MIX_MAXVOLUME = 128
-*/
-
-
-/***********************************************
-*   Mixing for 16 bit signed buffers
-***********************************************/
-
-#if defined(SDL_BUGGY_MMX_MIXERS) /* buggy, so we're disabling them. --ryan. */
-#if defined(__GNUC__) && defined(__i386__) && defined(SDL_ASSEMBLY_ROUTINES)
-void SDL_MixAudio_MMX_S16(char* dst,char* src,unsigned int size,int volume)
-{
-    __asm__ __volatile__ (
-
-"	movl %3,%%eax\n"	/* eax = volume */
-
-"	movl %2,%%edx\n"	/* edx = size */
-
-"	shrl $4,%%edx\n"	/* process 16 bytes per iteration = 8 samples */
-
-"	jz .endS16\n"
-
-"	pxor %%mm0,%%mm0\n"
-
-"	movd %%eax,%%mm0\n"
-"	movq %%mm0,%%mm1\n"
-"	psllq $16,%%mm0\n"
-"	por %%mm1,%%mm0\n"
-"	psllq $16,%%mm0\n"
-"	por %%mm1,%%mm0\n"
-"	psllq $16,%%mm0\n"
-"	por %%mm1,%%mm0\n"		/* mm0 = vol|vol|vol|vol */
-
-".align 8\n"
-"	.mixloopS16:\n"
-
-"	movq (%1),%%mm1\n" /* mm1 = a|b|c|d */
-
-"	movq %%mm1,%%mm2\n" /* mm2 = a|b|c|d */
-
-"	movq 8(%1),%%mm4\n" /* mm4 = e|f|g|h */
-
-	/* pr� charger le buffer dst dans mm7 */
-"	movq (%0),%%mm7\n" /* mm7 = dst[0] */
-
-	/* multiplier par le volume */
-"	pmullw %%mm0,%%mm1\n" /* mm1 = l(a*v)|l(b*v)|l(c*v)|l(d*v) */
-
-"	pmulhw %%mm0,%%mm2\n" /* mm2 = h(a*v)|h(b*v)|h(c*v)|h(d*v) */
-"	movq %%mm4,%%mm5\n" /* mm5 = e|f|g|h */
-
-"	pmullw %%mm0,%%mm4\n" /* mm4 = l(e*v)|l(f*v)|l(g*v)|l(h*v) */
-
-"	pmulhw %%mm0,%%mm5\n" /* mm5 = h(e*v)|h(f*v)|h(g*v)|h(h*v) */
-"	movq %%mm1,%%mm3\n" /* mm3 = l(a*v)|l(b*v)|l(c*v)|l(d*v) */
-
-"	punpckhwd %%mm2,%%mm1\n" /* mm1 = a*v|b*v */
-
-"	movq %%mm4,%%mm6\n" /* mm6 = l(e*v)|l(f*v)|l(g*v)|l(h*v) */
-"	punpcklwd %%mm2,%%mm3\n" /* mm3 = c*v|d*v */
-
-"	punpckhwd %%mm5,%%mm4\n" /* mm4 = e*f|f*v */
-
-"	punpcklwd %%mm5,%%mm6\n" /* mm6 = g*v|h*v */
-
-	/* pr� charger le buffer dst dans mm5 */
-"	movq 8(%0),%%mm5\n" /* mm5 = dst[1] */
-
-	/* diviser par 128 */
-"	psrad $7,%%mm1\n" /* mm1 = a*v/128|b*v/128 , 128 = SDL_MIX_MAXVOLUME */
-"	add $16,%1\n"
-
-"	psrad $7,%%mm3\n" /* mm3 = c*v/128|d*v/128 */
-
-"	psrad $7,%%mm4\n" /* mm4 = e*v/128|f*v/128 */
-
-	/* mm1 = le sample avec le volume modifi� */
-"	packssdw %%mm1,%%mm3\n" /* mm3 = s(a*v|b*v|c*v|d*v) */
-
-"	psrad $7,%%mm6\n" /* mm6= g*v/128|h*v/128 */
-"	paddsw %%mm7,%%mm3\n" /* mm3 = adjust_volume(src)+dst */
-
-	/* mm4 = le sample avec le volume modifi� */
-"	packssdw %%mm4,%%mm6\n" /* mm6 = s(e*v|f*v|g*v|h*v) */
-"	movq %%mm3,(%0)\n"
-
-"	paddsw %%mm5,%%mm6\n" /* mm6 = adjust_volume(src)+dst */
-
-"	movq %%mm6,8(%0)\n"
-
-"	add $16,%0\n"
-
-"	dec %%edx\n"
-
-"	jnz .mixloopS16\n"
-
-"	emms\n"
-
-".endS16:\n"
-	 :
-	 : "r" (dst), "r"(src),"m"(size),
-	 "m"(volume)
-	 : "eax","edx","memory"
-	 );
-}
-
-
-
-/*////////////////////////////////////////////// */
-/* Mixing for 8 bit signed buffers */
-/*////////////////////////////////////////////// */
-
-void SDL_MixAudio_MMX_S8(char* dst,char* src,unsigned int size,int volume)
-{
-    __asm__ __volatile__ (
-
-"	movl %3,%%eax\n"	/* eax = volume */
-
-"	movd %%eax,%%mm0\n"
-"	movq %%mm0,%%mm1\n"
-"	psllq $16,%%mm0\n"
-"	por %%mm1,%%mm0\n"
-"	psllq $16,%%mm0\n"
-"	por %%mm1,%%mm0\n"
-"	psllq $16,%%mm0\n"
-"	por %%mm1,%%mm0\n"
-
-"	movl %2,%%edx\n"	/* edx = size */
-"	shr $3,%%edx\n"	/* process 8 bytes per iteration = 8 samples */
-
-"	cmp $0,%%edx\n"
-"	je .endS8\n"
-
-".align 8\n"
-"	.mixloopS8:\n"
-
-"	pxor %%mm2,%%mm2\n"		/* mm2 = 0 */
-"	movq (%1),%%mm1\n"	/* mm1 = a|b|c|d|e|f|g|h */
-
-"	movq %%mm1,%%mm3\n" 	/* mm3 = a|b|c|d|e|f|g|h */
-
-	/* on va faire le "sign extension" en faisant un cmp avec 0 qui retourne 1 si <0, 0 si >0 */
-"	pcmpgtb %%mm1,%%mm2\n"	/* mm2 = 11111111|00000000|00000000.... */
-
-"	punpckhbw %%mm2,%%mm1\n"	/* mm1 = 0|a|0|b|0|c|0|d */
-
-"	punpcklbw %%mm2,%%mm3\n"	/* mm3 = 0|e|0|f|0|g|0|h */
-"	movq (%0),%%mm2\n"	/* mm2 = destination */
-
-"	pmullw %%mm0,%%mm1\n"	/* mm1 = v*a|v*b|v*c|v*d */
-"	add $8,%1\n"
-
-"	pmullw %%mm0,%%mm3\n"	/* mm3 = v*e|v*f|v*g|v*h */
-"	psraw $7,%%mm1\n"		/* mm1 = v*a/128|v*b/128|v*c/128|v*d/128  */
-
-"	psraw $7,%%mm3\n"		/* mm3 = v*e/128|v*f/128|v*g/128|v*h/128 */
-
-"	packsswb %%mm1,%%mm3\n"	/* mm1 = v*a/128|v*b/128|v*c/128|v*d/128|v*e/128|v*f/128|v*g/128|v*h/128 */
-
-"	paddsb %%mm2,%%mm3\n"	/* add to destination buffer */
-
-"	movq %%mm3,(%0)\n"	/* store back to ram */
-"	add $8,%0\n"
-
-"	dec %%edx\n"
-
-"	jnz .mixloopS8\n"
-
-".endS8:\n"
-"	emms\n"
-	 :
-	 : "r" (dst), "r"(src),"m"(size),
-	 "m"(volume)
-	 : "eax","edx","memory"
-	 );
-}
-#endif
-#endif
diff --git a/src/audio/SDL_mixer_MMX.h b/src/audio/SDL_mixer_MMX.h
deleted file mode 100644
index 836b259..0000000
--- a/src/audio/SDL_mixer_MMX.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
-    headers for MMX assembler version of SDL_MixAudio
-    Copyright 2002 Stephane Marchesin (stephane.marchesin@wanadoo.fr)
-    This code is licensed under the LGPL (see COPYING for details)
-
-    Assumes buffer size in bytes is a multiple of 16
-    Assumes SDL_MIX_MAXVOLUME = 128
-*/
-#include "SDL_config.h"
-
-#if defined(SDL_BUGGY_MMX_MIXERS) /* buggy, so we're disabling them. --ryan. */
-#if defined(__GNUC__) && defined(__i386__) && defined(SDL_ASSEMBLY_ROUTINES)
-void SDL_MixAudio_MMX_S16(char* ,char* ,unsigned int ,int );
-void SDL_MixAudio_MMX_S8(char* ,char* ,unsigned int ,int );
-#endif
-#endif
-
diff --git a/src/audio/SDL_mixer_MMX.o b/src/audio/SDL_mixer_MMX.o
deleted file mode 100644
index 84d64e3..0000000
Binary files a/src/audio/SDL_mixer_MMX.o and /dev/null differ
diff --git a/src/audio/SDL_mixer_MMX_VC.c b/src/audio/SDL_mixer_MMX_VC.c
deleted file mode 100644
index cd0aa4c..0000000
--- a/src/audio/SDL_mixer_MMX_VC.c
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-#include "SDL_mixer_MMX_VC.h"
-
-#if defined(SDL_BUGGY_MMX_MIXERS) /* buggy, so we're disabling them. --ryan. */
-#if ((defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)) && defined(SDL_ASSEMBLY_ROUTINES)
-// MMX assembler version of SDL_MixAudio for signed little endian 16 bit samples and signed 8 bit samples
-// Copyright 2002 Stephane Marchesin (stephane.marchesin@wanadoo.fr)
-// Converted to Intel ASM notation by Cth
-// This code is licensed under the LGPL (see COPYING for details)
-// 
-// Assumes buffer size in bytes is a multiple of 16
-// Assumes SDL_MIX_MAXVOLUME = 128
-
-
-////////////////////////////////////////////////
-// Mixing for 16 bit signed buffers
-////////////////////////////////////////////////
-
-void SDL_MixAudio_MMX_S16_VC(char* dst,char* src,unsigned int nSize,int volume)
-{
-	__asm
-	{
-
-		push	edi
-		push	esi
-		push	ebx
-		
-		mov		edi, dst		// edi = dst
-		mov		esi, src		// esi = src
-		mov		eax, volume		// eax = volume
-		mov		ebx, nSize		// ebx = size
-		shr		ebx, 4			// process 16 bytes per iteration = 8 samples
-		jz		endS16
-		
-		pxor	mm0, mm0
-		movd	mm0, eax		//%%eax,%%mm0
-		movq	mm1, mm0		//%%mm0,%%mm1
-		psllq	mm0, 16			//$16,%%mm0
-		por		mm0, mm1		//%%mm1,%%mm0
-		psllq	mm0, 16			//$16,%%mm0
-		por		mm0, mm1		//%%mm1,%%mm0
-		psllq	mm0, 16			//$16,%%mm0
-		por		mm0, mm1		//%%mm1,%%mm0			// mm0 = vol|vol|vol|vol
-
-		#ifndef __WATCOMC__
-		align	16
-		#endif
-mixloopS16:
-		movq	mm1, [esi]		//(%%esi),%%mm1\n" // mm1 = a|b|c|d
-		movq	mm2, mm1		//%%mm1,%%mm2\n" // mm2 = a|b|c|d
-		movq	mm4, [esi + 8]	//8(%%esi),%%mm4\n" // mm4 = e|f|g|h
-		// pre charger le buffer dst dans mm7
-		movq	mm7, [edi]		//(%%edi),%%mm7\n" // mm7 = dst[0]"
-		// multiplier par le volume
-		pmullw	mm1, mm0		//%%mm0,%%mm1\n" // mm1 = l(a*v)|l(b*v)|l(c*v)|l(d*v)
-		pmulhw	mm2, mm0		//%%mm0,%%mm2\n" // mm2 = h(a*v)|h(b*v)|h(c*v)|h(d*v)
-		movq	mm5, mm4		//%%mm4,%%mm5\n" // mm5 = e|f|g|h
-		pmullw	mm4, mm0		//%%mm0,%%mm4\n" // mm4 = l(e*v)|l(f*v)|l(g*v)|l(h*v)
-		pmulhw	mm5, mm0		//%%mm0,%%mm5\n" // mm5 = h(e*v)|h(f*v)|h(g*v)|h(h*v)
-		movq	mm3, mm1		//%%mm1,%%mm3\n" // mm3 = l(a*v)|l(b*v)|l(c*v)|l(d*v)
-		punpckhwd	mm1, mm2	//%%mm2,%%mm1\n" // mm1 = a*v|b*v
-		movq		mm6, mm4	//%%mm4,%%mm6\n" // mm6 = l(e*v)|l(f*v)|l(g*v)|l(h*v)
-		punpcklwd	mm3, mm2	//%%mm2,%%mm3\n" // mm3 = c*v|d*v
-		punpckhwd	mm4, mm5	//%%mm5,%%mm4\n" // mm4 = e*f|f*v
-		punpcklwd	mm6, mm5	//%%mm5,%%mm6\n" // mm6 = g*v|h*v
-		// pre charger le buffer dst dans mm5
-		movq	mm5, [edi + 8]	//8(%%edi),%%mm5\n" // mm5 = dst[1]
-		// diviser par 128
-		psrad	mm1, 7			//$7,%%mm1\n" // mm1 = a*v/128|b*v/128 , 128 = SDL_MIX_MAXVOLUME
-		add		esi, 16			//$16,%%esi\n"
-		psrad	mm3, 7			//$7,%%mm3\n" // mm3 = c*v/128|d*v/128
-		psrad	mm4, 7			//$7,%%mm4\n" // mm4 = e*v/128|f*v/128
-		// mm1 = le sample avec le volume modifie
-		packssdw	mm3, mm1	//%%mm1,%%mm3\n" // mm3 = s(a*v|b*v|c*v|d*v)
-		psrad	mm6, 7			//$7,%%mm6\n" // mm6= g*v/128|h*v/128
-		paddsw	mm3, mm7		//%%mm7,%%mm3\n" // mm3 = adjust_volume(src)+dst
-		// mm4 = le sample avec le volume modifie
-		packssdw	mm6, mm4	//%%mm4,%%mm6\n" // mm6 = s(e*v|f*v|g*v|h*v)
-		movq	[edi], mm3		//%%mm3,(%%edi)\n"
-		paddsw	mm6, mm5		//%%mm5,%%mm6\n" // mm6 = adjust_volume(src)+dst
-		movq	[edi + 8], mm6	//%%mm6,8(%%edi)\n"
-		add		edi, 16			//$16,%%edi\n"
-		dec		ebx				//%%ebx\n"
-		jnz mixloopS16
-
-endS16:
-		emms
-		
-		pop		ebx
-		pop		esi
-		pop		edi
-	}
-
-}
-
-////////////////////////////////////////////////
-// Mixing for 8 bit signed buffers
-////////////////////////////////////////////////
-
-void SDL_MixAudio_MMX_S8_VC(char* dst,char* src,unsigned int nSize,int volume)
-{
-	_asm
-	{
-
-		push	edi
-		push	esi
-		push	ebx
-		
-		mov		edi, dst	//movl	%0,%%edi	// edi = dst
-		mov		esi, src	//%1,%%esi	// esi = src
-		mov		eax, volume	//%3,%%eax	// eax = volume
-
-		movd	mm0, eax	//%%eax,%%mm0
-		movq	mm1, mm0	//%%mm0,%%mm1
-		psllq	mm0, 16		//$16,%%mm0
-		por		mm0, mm1	//%%mm1,%%mm0
-		psllq	mm0, 16		//$16,%%mm0
-		por		mm0, mm1	//%%mm1,%%mm0
-		psllq	mm0, 16		//$16,%%mm0
-		por		mm0, mm1	//%%mm1,%%mm0
-		
-		mov		ebx, nSize	//%2,%%ebx	// ebx = size
-		shr		ebx, 3		//$3,%%ebx	// process 8 bytes per iteration = 8 samples
-		cmp		ebx, 0		//$0,%%ebx
-		je		endS8
-
-		#ifndef __WATCOMC__
-		align 16
-		#endif
-mixloopS8:
-		pxor	mm2, mm2	//%%mm2,%%mm2		// mm2 = 0
-		movq	mm1, [esi]	//(%%esi),%%mm1	// mm1 = a|b|c|d|e|f|g|h
-		movq	mm3, mm1	//%%mm1,%%mm3 	// mm3 = a|b|c|d|e|f|g|h
-		// on va faire le "sign extension" en faisant un cmp avec 0 qui retourne 1 si <0, 0 si >0
-		pcmpgtb		mm2, mm1	//%%mm1,%%mm2	// mm2 = 11111111|00000000|00000000....
-		punpckhbw	mm1, mm2	//%%mm2,%%mm1	// mm1 = 0|a|0|b|0|c|0|d
-		punpcklbw	mm3, mm2	//%%mm2,%%mm3	// mm3 = 0|e|0|f|0|g|0|h
-		movq	mm2, [edi]	//(%%edi),%%mm2	// mm2 = destination
-		pmullw	mm1, mm0	//%%mm0,%%mm1	// mm1 = v*a|v*b|v*c|v*d
-		add		esi, 8		//$8,%%esi
-		pmullw	mm3, mm0	//%%mm0,%%mm3	// mm3 = v*e|v*f|v*g|v*h
-		psraw	mm1, 7		//$7,%%mm1		// mm1 = v*a/128|v*b/128|v*c/128|v*d/128 
-		psraw	mm3, 7		//$7,%%mm3		// mm3 = v*e/128|v*f/128|v*g/128|v*h/128
-		packsswb mm3, mm1	//%%mm1,%%mm3	// mm1 = v*a/128|v*b/128|v*c/128|v*d/128|v*e/128|v*f/128|v*g/128|v*h/128
-		paddsb	mm3, mm2	//%%mm2,%%mm3	// add to destination buffer
-		movq	[edi], mm3	//%%mm3,(%%edi)	// store back to ram
-		add		edi, 8		//$8,%%edi
-		dec		ebx			//%%ebx
-		jnz		mixloopS8
-		
-endS8:
-		emms
-		
-		pop		ebx
-		pop		esi
-		pop		edi
-	}
-}
-
-#endif /* SDL_ASSEMBLY_ROUTINES */
-#endif /* SDL_BUGGY_MMX_MIXERS */
diff --git a/src/audio/SDL_mixer_MMX_VC.h b/src/audio/SDL_mixer_MMX_VC.h
deleted file mode 100644
index fbd0eb8..0000000
--- a/src/audio/SDL_mixer_MMX_VC.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-
-#if defined(SDL_BUGGY_MMX_MIXERS) /* buggy, so we're disabling them. --ryan. */
-#if ((defined(_MSC_VER) && defined(_M_IX86)) || defined(__WATCOMC__)) && defined(SDL_ASSEMBLY_ROUTINES)
-/* headers for MMX assembler version of SDL_MixAudio
-   Copyright 2002 Stephane Marchesin (stephane.marchesin@wanadoo.fr)
-   Converted to Intel ASM notation by Cth
-   This code is licensed under the LGPL (see COPYING for details)
-   
-   Assumes buffer size in bytes is a multiple of 16
-   Assumes SDL_MIX_MAXVOLUME = 128
-*/
-void SDL_MixAudio_MMX_S16_VC(char* ,char* ,unsigned int ,int );
-void SDL_MixAudio_MMX_S8_VC(char* ,char* ,unsigned int ,int );
-#endif
-#endif
diff --git a/src/audio/SDL_mixer_MMX_VC.o b/src/audio/SDL_mixer_MMX_VC.o
deleted file mode 100644
index f370377..0000000
Binary files a/src/audio/SDL_mixer_MMX_VC.o and /dev/null differ
diff --git a/src/audio/SDL_mixer_m68k.c b/src/audio/SDL_mixer_m68k.c
deleted file mode 100644
index 22bb1cb..0000000
--- a/src/audio/SDL_mixer_m68k.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Library General Public
-    License as published by the Free Software Foundation; either
-    version 2 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Library General Public License for more details.
-
-    You should have received a copy of the GNU Library General Public
-    License along with this library; if not, write to the Free
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/*
-	m68k assembly mix routines
-
-	Patrice Mandin
-*/
-
-#if (defined(__m68k__) && !defined(__mcoldfire__)) && defined(__GNUC__)
-void SDL_MixAudio_m68k_U8(char* dst, char* src, long len, long volume, char* mix8)
-{
-    __asm__ __volatile__ (
-
-	"tstl	%2\n"
-"	beqs	stoploop_u8\n"
-"mixloop_u8:\n"
-
-	/* Mix a sample */
-
-"	moveq	#0,%%d0\n"
-"	moveq	#0,%%d1\n"
-
-"	moveb	%1@+,%%d0\n"	/* d0 = *src++ */
-"	sub	#128,%%d0\n"	/* d0 -= 128 */
-"	muls	%3,%%d0\n"	/* d0 *= volume (0<=volume<=128) */
-"	moveb	%0@,%%d1\n"	/* d1 = *dst */
-"	asr	#7,%%d0\n"	/* d0 /= 128 (SDL_MIX_MAXVOLUME) */
-"	add	#128,%%d0\n"	/* d0 += 128 */
-
-"	add	%%d1,%%d0\n"
-
-"	moveb	%4@(%%d0:w),%0@+\n"
-
-	/* Loop till done */
-
-"	subql	#1,%2\n"
-"	bhis	mixloop_u8\n"
-"stoploop_u8:\n"
-
-	 : /* no return value */
-	 : /* input */
-	 	"a"(dst), "a"(src), "d"(len), "d"(volume), "a"(mix8)	
-	 : /* clobbered registers */
-	 	"d0", "d1", "cc", "memory" 
-	 );
-}
-
-void SDL_MixAudio_m68k_S8(char* dst, char* src, long len, long volume)
-{
-    __asm__ __volatile__ (
-
-	"tstl	%2\n"
-"	beqs	stoploop_s8\n"
-"	moveq	#-128,%%d2\n"
-"	moveq	#127,%%d3\n"
-"mixloop_s8:\n"
-
-	/* Mix a sample */
-
-"	moveq	#0,%%d0\n"
-"	moveq	#0,%%d1\n"
-
-"	moveb	%1@+,%%d0\n"	/* d0 = *src++ */
-"	muls	%3,%%d0\n"	/* d0 *= volume (0<=volume<=128) */
-"	moveb	%0@,%%d1\n"	/* d1 = *dst */
-"	asr	#7,%%d0\n"	/* d0 /= 128 (SDL_MIX_MAXVOLUME) */
-
-"	add	%%d1,%%d0\n"
-
-"	cmp	%%d2,%%d0\n"
-"	bges	lower_limit_s8\n"
-"	move	%%d2,%%d0\n"
-"lower_limit_s8:\n"
-
-"	cmp	%%d3,%%d0\n"
-"	bles	upper_limit_s8\n"
-"	move	%%d3,%%d0\n"
-"upper_limit_s8:\n"
-"	moveb	%%d0,%0@+\n"
-
-	/* Loop till done */
-
-"	subql	#1,%2\n"
-"	bhis	mixloop_s8\n"
-"stoploop_s8:\n"
-
-	 : /* no return value */
-	 : /* input */
-	 	"a"(dst), "a"(src), "d"(len), "d"(volume)	
-	 : /* clobbered registers */
-	 	"d0", "d1", "d2", "d3", "cc", "memory" 
-	 );
-}
-
-void SDL_MixAudio_m68k_S16MSB(short* dst, short* src, long len, long volume)
-{
-    __asm__ __volatile__ (
-
-	"tstl	%2\n"
-"	beqs	stoploop_s16msb\n"
-"	movel	#-32768,%%d2\n"
-"	movel	#32767,%%d3\n"
-"	lsrl	#1,%2\n"
-"mixloop_s16msb:\n"
-
-	/* Mix a sample */
-
-"	move	%1@+,%%d0\n"	/* d0 = *src++ */
-"	muls	%3,%%d0\n"	/* d0 *= volume (0<=volume<=128) */
-"	move	%0@,%%d1\n"	/* d1 = *dst */
-"	extl	%%d1\n"		/* extend d1 to 32 bits */
-"	asrl	#7,%%d0\n"	/* d0 /= 128 (SDL_MIX_MAXVOLUME) */
-
-"	addl	%%d1,%%d0\n"
-
-"	cmpl	%%d2,%%d0\n"
-"	bges	lower_limit_s16msb\n"
-"	move	%%d2,%%d0\n"
-"lower_limit_s16msb:\n"
-
-"	cmpl	%%d3,%%d0\n"
-"	bles	upper_limit_s16msb\n"
-"	move	%%d3,%%d0\n"
-"upper_limit_s16msb:\n"
-"	move	%%d0,%0@+\n"
-
-	/* Loop till done */
-
-"	subql	#1,%2\n"
-"	bhis	mixloop_s16msb\n"
-"stoploop_s16msb:\n"
-
-	 : /* no return value */
-	 : /* input */
-	 	"a"(dst), "a"(src), "d"(len), "d"(volume)	
-	 : /* clobbered registers */
-	 	"d0", "d1", "d2", "d3", "cc", "memory" 
-	 );
-}
-
-void SDL_MixAudio_m68k_S16LSB(short* dst, short* src, long len, long volume)
-{
-    __asm__ __volatile__ (
-
-	"tstl	%2\n"
-"	beqs	stoploop_s16lsb\n"
-"	movel	#-32768,%%d2\n"
-"	movel	#32767,%%d3\n"
-"	lsrl	#1,%2\n"
-"mixloop_s16lsb:\n"
-
-	/* Mix a sample */
-
-"	move	%1@+,%%d0\n"	/* d0 = *src++ */
-"	rorw	#8,%%d0\n"
-"	muls	%3,%%d0\n"	/* d0 *= volume (0<=volume<=128) */
-"	move	%0@,%%d1\n"	/* d1 = *dst */
-"	rorw	#8,%%d1\n"
-"	extl	%%d1\n"		/* extend d1 to 32 bits */
-"	asrl	#7,%%d0\n"	/* d0 /= 128 (SDL_MIX_MAXVOLUME) */
-
-"	addl	%%d1,%%d0\n"
-
-"	cmpl	%%d2,%%d0\n"
-"	bges	lower_limit_s16lsb\n"
-"	move	%%d2,%%d0\n"
-"lower_limit_s16lsb:\n"
-
-"	cmpl	%%d3,%%d0\n"
-"	bles	upper_limit_s16lsb\n"
-"	move	%%d3,%%d0\n"
-"upper_limit_s16lsb:\n"
-"	rorw	#8,%%d0\n"
-"	move	%%d0,%0@+\n"
-
-	/* Loop till done */
-
-"	subql	#1,%2\n"
-"	bhis	mixloop_s16lsb\n"
-"stoploop_s16lsb:\n"
-
-	 : /* no return value */
-	 : /* input */
-	 	"a"(dst), "a"(src), "d"(len), "d"(volume)	
-	 : /* clobbered registers */
-	 	"d0", "d1", "d2", "d3", "cc", "memory" 
-	 );
-}
-#endif
diff --git a/src/audio/SDL_mixer_m68k.h b/src/audio/SDL_mixer_m68k.h
deleted file mode 100644
index 673df00..0000000
--- a/src/audio/SDL_mixer_m68k.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Library General Public
-    License as published by the Free Software Foundation; either
-    version 2 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Library General Public License for more details.
-
-    You should have received a copy of the GNU Library General Public
-    License along with this library; if not, write to the Free
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/*
-	m68k assembly mix routines
-
-	Patrice Mandin
-*/
-
-#if (defined(__m68k__) && !defined(__mcoldfire__)) && defined(__GNUC__)
-void SDL_MixAudio_m68k_U8(char* dst,char* src, long len, long volume, char* mix8);
-void SDL_MixAudio_m68k_S8(char* dst,char* src, long len, long volume);
-
-void SDL_MixAudio_m68k_S16MSB(short* dst,short* src, long len, long volume);
-void SDL_MixAudio_m68k_S16LSB(short* dst,short* src, long len, long volume);
-#endif
diff --git a/src/audio/SDL_mixer_m68k.o b/src/audio/SDL_mixer_m68k.o
deleted file mode 100644
index 3c9c08e..0000000
Binary files a/src/audio/SDL_mixer_m68k.o and /dev/null differ
diff --git a/src/audio/SDL_wave.c b/src/audio/SDL_wave.c
deleted file mode 100644
index b4ad6c7..0000000
--- a/src/audio/SDL_wave.c
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/* Microsoft WAVE file loading routines */
-
-#include "SDL_audio.h"
-#include "SDL_wave.h"
-
-
-static int ReadChunk(SDL_RWops *src, Chunk *chunk);
-
-struct MS_ADPCM_decodestate {
-	Uint8 hPredictor;
-	Uint16 iDelta;
-	Sint16 iSamp1;
-	Sint16 iSamp2;
-};
-static struct MS_ADPCM_decoder {
-	WaveFMT wavefmt;
-	Uint16 wSamplesPerBlock;
-	Uint16 wNumCoef;
-	Sint16 aCoeff[7][2];
-	/* * * */
-	struct MS_ADPCM_decodestate state[2];
-} MS_ADPCM_state;
-
-static int InitMS_ADPCM(WaveFMT *format)
-{
-	Uint8 *rogue_feel;
-	int i;
-
-	/* Set the rogue pointer to the MS_ADPCM specific data */
-	MS_ADPCM_state.wavefmt.encoding = SDL_SwapLE16(format->encoding);
-	MS_ADPCM_state.wavefmt.channels = SDL_SwapLE16(format->channels);
-	MS_ADPCM_state.wavefmt.frequency = SDL_SwapLE32(format->frequency);
-	MS_ADPCM_state.wavefmt.byterate = SDL_SwapLE32(format->byterate);
-	MS_ADPCM_state.wavefmt.blockalign = SDL_SwapLE16(format->blockalign);
-	MS_ADPCM_state.wavefmt.bitspersample =
-					 SDL_SwapLE16(format->bitspersample);
-	rogue_feel = (Uint8 *)format+sizeof(*format);
-	if ( sizeof(*format) == 16 ) {
-		rogue_feel += sizeof(Uint16);
-	}
-	MS_ADPCM_state.wSamplesPerBlock = ((rogue_feel[1]<<8)|rogue_feel[0]);
-	rogue_feel += sizeof(Uint16);
-	MS_ADPCM_state.wNumCoef = ((rogue_feel[1]<<8)|rogue_feel[0]);
-	rogue_feel += sizeof(Uint16);
-	if ( MS_ADPCM_state.wNumCoef != 7 ) {
-		SDL_SetError("Unknown set of MS_ADPCM coefficients");
-		return(-1);
-	}
-	for ( i=0; i<MS_ADPCM_state.wNumCoef; ++i ) {
-		MS_ADPCM_state.aCoeff[i][0] = ((rogue_feel[1]<<8)|rogue_feel[0]);
-		rogue_feel += sizeof(Uint16);
-		MS_ADPCM_state.aCoeff[i][1] = ((rogue_feel[1]<<8)|rogue_feel[0]);
-		rogue_feel += sizeof(Uint16);
-	}
-	return(0);
-}
-
-static Sint32 MS_ADPCM_nibble(struct MS_ADPCM_decodestate *state,
-					Uint8 nybble, Sint16 *coeff)
-{
-	const Sint32 max_audioval = ((1<<(16-1))-1);
-	const Sint32 min_audioval = -(1<<(16-1));
-	const Sint32 adaptive[] = {
-		230, 230, 230, 230, 307, 409, 512, 614,
-		768, 614, 512, 409, 307, 230, 230, 230
-	};
-	Sint32 new_sample, delta;
-
-	new_sample = ((state->iSamp1 * coeff[0]) +
-		      (state->iSamp2 * coeff[1]))/256;
-	if ( nybble & 0x08 ) {
-		new_sample += state->iDelta * (nybble-0x10);
-	} else {
-		new_sample += state->iDelta * nybble;
-	}
-	if ( new_sample < min_audioval ) {
-		new_sample = min_audioval;
-	} else
-	if ( new_sample > max_audioval ) {
-		new_sample = max_audioval;
-	}
-	delta = ((Sint32)state->iDelta * adaptive[nybble])/256;
-	if ( delta < 16 ) {
-		delta = 16;
-	}
-	state->iDelta = (Uint16)delta;
-	state->iSamp2 = state->iSamp1;
-	state->iSamp1 = (Sint16)new_sample;
-	return(new_sample);
-}
-
-static int MS_ADPCM_decode(Uint8 **audio_buf, Uint32 *audio_len)
-{
-	struct MS_ADPCM_decodestate *state[2];
-	Uint8 *freeable, *encoded, *decoded;
-	Sint32 encoded_len, samplesleft;
-	Sint8 nybble, stereo;
-	Sint16 *coeff[2];
-	Sint32 new_sample;
-
-	/* Allocate the proper sized output buffer */
-	encoded_len = *audio_len;
-	encoded = *audio_buf;
-	freeable = *audio_buf;
-	*audio_len = (encoded_len/MS_ADPCM_state.wavefmt.blockalign) * 
-				MS_ADPCM_state.wSamplesPerBlock*
-				MS_ADPCM_state.wavefmt.channels*sizeof(Sint16);
-	*audio_buf = (Uint8 *)SDL_malloc(*audio_len);
-	if ( *audio_buf == NULL ) {
-		SDL_Error(SDL_ENOMEM);
-		return(-1);
-	}
-	decoded = *audio_buf;
-
-	/* Get ready... Go! */
-	stereo = (MS_ADPCM_state.wavefmt.channels == 2);
-	state[0] = &MS_ADPCM_state.state[0];
-	state[1] = &MS_ADPCM_state.state[stereo];
-	while ( encoded_len >= MS_ADPCM_state.wavefmt.blockalign ) {
-		/* Grab the initial information for this block */
-		state[0]->hPredictor = *encoded++;
-		if ( stereo ) {
-			state[1]->hPredictor = *encoded++;
-		}
-		state[0]->iDelta = ((encoded[1]<<8)|encoded[0]);
-		encoded += sizeof(Sint16);
-		if ( stereo ) {
-			state[1]->iDelta = ((encoded[1]<<8)|encoded[0]);
-			encoded += sizeof(Sint16);
-		}
-		state[0]->iSamp1 = ((encoded[1]<<8)|encoded[0]);
-		encoded += sizeof(Sint16);
-		if ( stereo ) {
-			state[1]->iSamp1 = ((encoded[1]<<8)|encoded[0]);
-			encoded += sizeof(Sint16);
-		}
-		state[0]->iSamp2 = ((encoded[1]<<8)|encoded[0]);
-		encoded += sizeof(Sint16);
-		if ( stereo ) {
-			state[1]->iSamp2 = ((encoded[1]<<8)|encoded[0]);
-			encoded += sizeof(Sint16);
-		}
-		coeff[0] = MS_ADPCM_state.aCoeff[state[0]->hPredictor];
-		coeff[1] = MS_ADPCM_state.aCoeff[state[1]->hPredictor];
-
-		/* Store the two initial samples we start with */
-		decoded[0] = state[0]->iSamp2&0xFF;
-		decoded[1] = state[0]->iSamp2>>8;
-		decoded += 2;
-		if ( stereo ) {
-			decoded[0] = state[1]->iSamp2&0xFF;
-			decoded[1] = state[1]->iSamp2>>8;
-			decoded += 2;
-		}
-		decoded[0] = state[0]->iSamp1&0xFF;
-		decoded[1] = state[0]->iSamp1>>8;
-		decoded += 2;
-		if ( stereo ) {
-			decoded[0] = state[1]->iSamp1&0xFF;
-			decoded[1] = state[1]->iSamp1>>8;
-			decoded += 2;
-		}
-
-		/* Decode and store the other samples in this block */
-		samplesleft = (MS_ADPCM_state.wSamplesPerBlock-2)*
-					MS_ADPCM_state.wavefmt.channels;
-		while ( samplesleft > 0 ) {
-			nybble = (*encoded)>>4;
-			new_sample = MS_ADPCM_nibble(state[0],nybble,coeff[0]);
-			decoded[0] = new_sample&0xFF;
-			new_sample >>= 8;
-			decoded[1] = new_sample&0xFF;
-			decoded += 2;
-
-			nybble = (*encoded)&0x0F;
-			new_sample = MS_ADPCM_nibble(state[1],nybble,coeff[1]);
-			decoded[0] = new_sample&0xFF;
-			new_sample >>= 8;
-			decoded[1] = new_sample&0xFF;
-			decoded += 2;
-
-			++encoded;
-			samplesleft -= 2;
-		}
-		encoded_len -= MS_ADPCM_state.wavefmt.blockalign;
-	}
-	SDL_free(freeable);
-	return(0);
-}
-
-struct IMA_ADPCM_decodestate {
-	Sint32 sample;
-	Sint8 index;
-};
-static struct IMA_ADPCM_decoder {
-	WaveFMT wavefmt;
-	Uint16 wSamplesPerBlock;
-	/* * * */
-	struct IMA_ADPCM_decodestate state[2];
-} IMA_ADPCM_state;
-
-static int InitIMA_ADPCM(WaveFMT *format)
-{
-	Uint8 *rogue_feel;
-
-	/* Set the rogue pointer to the IMA_ADPCM specific data */
-	IMA_ADPCM_state.wavefmt.encoding = SDL_SwapLE16(format->encoding);
-	IMA_ADPCM_state.wavefmt.channels = SDL_SwapLE16(format->channels);
-	IMA_ADPCM_state.wavefmt.frequency = SDL_SwapLE32(format->frequency);
-	IMA_ADPCM_state.wavefmt.byterate = SDL_SwapLE32(format->byterate);
-	IMA_ADPCM_state.wavefmt.blockalign = SDL_SwapLE16(format->blockalign);
-	IMA_ADPCM_state.wavefmt.bitspersample =
-					 SDL_SwapLE16(format->bitspersample);
-	rogue_feel = (Uint8 *)format+sizeof(*format);
-	if ( sizeof(*format) == 16 ) {
-		rogue_feel += sizeof(Uint16);
-	}
-	IMA_ADPCM_state.wSamplesPerBlock = ((rogue_feel[1]<<8)|rogue_feel[0]);
-	return(0);
-}
-
-static Sint32 IMA_ADPCM_nibble(struct IMA_ADPCM_decodestate *state,Uint8 nybble)
-{
-	const Sint32 max_audioval = ((1<<(16-1))-1);
-	const Sint32 min_audioval = -(1<<(16-1));
-	const int index_table[16] = {
-		-1, -1, -1, -1,
-		 2,  4,  6,  8,
-		-1, -1, -1, -1,
-		 2,  4,  6,  8
-	};
-	const Sint32 step_table[89] = {
-		7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 19, 21, 23, 25, 28, 31,
-		34, 37, 41, 45, 50, 55, 60, 66, 73, 80, 88, 97, 107, 118, 130,
-		143, 157, 173, 190, 209, 230, 253, 279, 307, 337, 371, 408,
-		449, 494, 544, 598, 658, 724, 796, 876, 963, 1060, 1166, 1282,
-		1411, 1552, 1707, 1878, 2066, 2272, 2499, 2749, 3024, 3327,
-		3660, 4026, 4428, 4871, 5358, 5894, 6484, 7132, 7845, 8630,
-		9493, 10442, 11487, 12635, 13899, 15289, 16818, 18500, 20350,
-		22385, 24623, 27086, 29794, 32767
-	};
-	Sint32 delta, step;
-
-	/* Compute difference and new sample value */
-	step = step_table[state->index];
-	delta = step >> 3;
-	if ( nybble & 0x04 ) delta += step;
-	if ( nybble & 0x02 ) delta += (step >> 1);
-	if ( nybble & 0x01 ) delta += (step >> 2);
-	if ( nybble & 0x08 ) delta = -delta;
-	state->sample += delta;
-
-	/* Update index value */
-	state->index += index_table[nybble];
-	if ( state->index > 88 ) {
-		state->index = 88;
-	} else
-	if ( state->index < 0 ) {
-		state->index = 0;
-	}
-
-	/* Clamp output sample */
-	if ( state->sample > max_audioval ) {
-		state->sample = max_audioval;
-	} else
-	if ( state->sample < min_audioval ) {
-		state->sample = min_audioval;
-	}
-	return(state->sample);
-}
-
-/* Fill the decode buffer with a channel block of data (8 samples) */
-static void Fill_IMA_ADPCM_block(Uint8 *decoded, Uint8 *encoded,
-	int channel, int numchannels, struct IMA_ADPCM_decodestate *state)
-{
-	int i;
-	Sint8 nybble;
-	Sint32 new_sample;
-
-	decoded += (channel * 2);
-	for ( i=0; i<4; ++i ) {
-		nybble = (*encoded)&0x0F;
-		new_sample = IMA_ADPCM_nibble(state, nybble);
-		decoded[0] = new_sample&0xFF;
-		new_sample >>= 8;
-		decoded[1] = new_sample&0xFF;
-		decoded += 2 * numchannels;
-
-		nybble = (*encoded)>>4;
-		new_sample = IMA_ADPCM_nibble(state, nybble);
-		decoded[0] = new_sample&0xFF;
-		new_sample >>= 8;
-		decoded[1] = new_sample&0xFF;
-		decoded += 2 * numchannels;
-
-		++encoded;
-	}
-}
-
-static int IMA_ADPCM_decode(Uint8 **audio_buf, Uint32 *audio_len)
-{
-	struct IMA_ADPCM_decodestate *state;
-	Uint8 *freeable, *encoded, *decoded;
-	Sint32 encoded_len, samplesleft;
-	unsigned int c, channels;
-
-	/* Check to make sure we have enough variables in the state array */
-	channels = IMA_ADPCM_state.wavefmt.channels;
-	if ( channels > SDL_arraysize(IMA_ADPCM_state.state) ) {
-		SDL_SetError("IMA ADPCM decoder can only handle %d channels",
-					SDL_arraysize(IMA_ADPCM_state.state));
-		return(-1);
-	}
-	state = IMA_ADPCM_state.state;
-
-	/* Allocate the proper sized output buffer */
-	encoded_len = *audio_len;
-	encoded = *audio_buf;
-	freeable = *audio_buf;
-	*audio_len = (encoded_len/IMA_ADPCM_state.wavefmt.blockalign) * 
-				IMA_ADPCM_state.wSamplesPerBlock*
-				IMA_ADPCM_state.wavefmt.channels*sizeof(Sint16);
-	*audio_buf = (Uint8 *)SDL_malloc(*audio_len);
-	if ( *audio_buf == NULL ) {
-		SDL_Error(SDL_ENOMEM);
-		return(-1);
-	}
-	decoded = *audio_buf;
-
-	/* Get ready... Go! */
-	while ( encoded_len >= IMA_ADPCM_state.wavefmt.blockalign ) {
-		/* Grab the initial information for this block */
-		for ( c=0; c<channels; ++c ) {
-			/* Fill the state information for this block */
-			state[c].sample = ((encoded[1]<<8)|encoded[0]);
-			encoded += 2;
-			if ( state[c].sample & 0x8000 ) {
-				state[c].sample -= 0x10000;
-			}
-			state[c].index = *encoded++;
-			/* Reserved byte in buffer header, should be 0 */
-			if ( *encoded++ != 0 ) {
-				/* Uh oh, corrupt data?  Buggy code? */;
-			}
-
-			/* Store the initial sample we start with */
-			decoded[0] = (Uint8)(state[c].sample&0xFF);
-			decoded[1] = (Uint8)(state[c].sample>>8);
-			decoded += 2;
-		}
-
-		/* Decode and store the other samples in this block */
-		samplesleft = (IMA_ADPCM_state.wSamplesPerBlock-1)*channels;
-		while ( samplesleft > 0 ) {
-			for ( c=0; c<channels; ++c ) {
-				Fill_IMA_ADPCM_block(decoded, encoded,
-						c, channels, &state[c]);
-				encoded += 4;
-				samplesleft -= 8;
-			}
-			decoded += (channels * 8 * 2);
-		}
-		encoded_len -= IMA_ADPCM_state.wavefmt.blockalign;
-	}
-	SDL_free(freeable);
-	return(0);
-}
-
-SDL_AudioSpec * SDL_LoadWAV_RW (SDL_RWops *src, int freesrc,
-		SDL_AudioSpec *spec, Uint8 **audio_buf, Uint32 *audio_len)
-{
-	int was_error;
-	Chunk chunk;
-	int lenread;
-	int MS_ADPCM_encoded, IMA_ADPCM_encoded;
-	int samplesize;
-
-	/* WAV magic header */
-	Uint32 RIFFchunk;
-	Uint32 wavelen = 0;
-	Uint32 WAVEmagic;
-	Uint32 headerDiff = 0;
-
-	/* FMT chunk */
-	WaveFMT *format = NULL;
-
-	/* Make sure we are passed a valid data source */
-	was_error = 0;
-	if ( src == NULL ) {
-		was_error = 1;
-		goto done;
-	}
-		
-	/* Check the magic header */
-	RIFFchunk	= SDL_ReadLE32(src);
-	wavelen		= SDL_ReadLE32(src);
-	if ( wavelen == WAVE ) { /* The RIFFchunk has already been read */
-		WAVEmagic = wavelen;
-		wavelen   = RIFFchunk;
-		RIFFchunk = RIFF;
-	} else {
-		WAVEmagic = SDL_ReadLE32(src);
-	}
-	if ( (RIFFchunk != RIFF) || (WAVEmagic != WAVE) ) {
-		SDL_SetError("Unrecognized file type (not WAVE)");
-		was_error = 1;
-		goto done;
-	}
-	headerDiff += sizeof(Uint32); /* for WAVE */
-
-	/* Read the audio data format chunk */
-	chunk.data = NULL;
-	do {
-		if ( chunk.data != NULL ) {
-			SDL_free(chunk.data);
-			chunk.data = NULL;
-		}
-		lenread = ReadChunk(src, &chunk);
-		if ( lenread < 0 ) {
-			was_error = 1;
-			goto done;
-		}
-		/* 2 Uint32's for chunk header+len, plus the lenread */
-		headerDiff += lenread + 2 * sizeof(Uint32);
-	} while ( (chunk.magic == FACT) || (chunk.magic == LIST) );
-
-	/* Decode the audio data format */
-	format = (WaveFMT *)chunk.data;
-	if ( chunk.magic != FMT ) {
-		SDL_SetError("Complex WAVE files not supported");
-		was_error = 1;
-		goto done;
-	}
-	MS_ADPCM_encoded = IMA_ADPCM_encoded = 0;
-	switch (SDL_SwapLE16(format->encoding)) {
-		case PCM_CODE:
-			/* We can understand this */
-			break;
-		case MS_ADPCM_CODE:
-			/* Try to understand this */
-			if ( InitMS_ADPCM(format) < 0 ) {
-				was_error = 1;
-				goto done;
-			}
-			MS_ADPCM_encoded = 1;
-			break;
-		case IMA_ADPCM_CODE:
-			/* Try to understand this */
-			if ( InitIMA_ADPCM(format) < 0 ) {
-				was_error = 1;
-				goto done;
-			}
-			IMA_ADPCM_encoded = 1;
-			break;
-		case MP3_CODE:
-			SDL_SetError("MPEG Layer 3 data not supported",
-					SDL_SwapLE16(format->encoding));
-			was_error = 1;
-			goto done;
-		default:
-			SDL_SetError("Unknown WAVE data format: 0x%.4x",
-					SDL_SwapLE16(format->encoding));
-			was_error = 1;
-			goto done;
-	}
-	SDL_memset(spec, 0, (sizeof *spec));
-	spec->freq = SDL_SwapLE32(format->frequency);
-	switch (SDL_SwapLE16(format->bitspersample)) {
-		case 4:
-			if ( MS_ADPCM_encoded || IMA_ADPCM_encoded ) {
-				spec->format = AUDIO_S16;
-			} else {
-				was_error = 1;
-			}
-			break;
-		case 8:
-			spec->format = AUDIO_U8;
-			break;
-		case 16:
-			spec->format = AUDIO_S16;
-			break;
-		default:
-			was_error = 1;
-			break;
-	}
-	if ( was_error ) {
-		SDL_SetError("Unknown %d-bit PCM data format",
-			SDL_SwapLE16(format->bitspersample));
-		goto done;
-	}
-	spec->channels = (Uint8)SDL_SwapLE16(format->channels);
-	spec->samples = 4096;		/* Good default buffer size */
-
-	/* Read the audio data chunk */
-	*audio_buf = NULL;
-	do {
-		if ( *audio_buf != NULL ) {
-			SDL_free(*audio_buf);
-			*audio_buf = NULL;
-		}
-		lenread = ReadChunk(src, &chunk);
-		if ( lenread < 0 ) {
-			was_error = 1;
-			goto done;
-		}
-		*audio_len = lenread;
-		*audio_buf = chunk.data;
-		if(chunk.magic != DATA) headerDiff += lenread + 2 * sizeof(Uint32);
-	} while ( chunk.magic != DATA );
-	headerDiff += 2 * sizeof(Uint32); /* for the data chunk and len */
-
-	if ( MS_ADPCM_encoded ) {
-		if ( MS_ADPCM_decode(audio_buf, audio_len) < 0 ) {
-			was_error = 1;
-			goto done;
-		}
-	}
-	if ( IMA_ADPCM_encoded ) {
-		if ( IMA_ADPCM_decode(audio_buf, audio_len) < 0 ) {
-			was_error = 1;
-			goto done;
-		}
-	}
-
-	/* Don't return a buffer that isn't a multiple of samplesize */
-	samplesize = ((spec->format & 0xFF)/8)*spec->channels;
-	*audio_len &= ~(samplesize-1);
-
-done:
-	if ( format != NULL ) {
-		SDL_free(format);
-	}
-	if ( src ) {
-		if ( freesrc ) {
-			SDL_RWclose(src);
-		} else {
-			/* seek to the end of the file (given by the RIFF chunk) */
-			SDL_RWseek(src, wavelen - chunk.length - headerDiff, RW_SEEK_CUR);
-		}
-	}
-	if ( was_error ) {
-		spec = NULL;
-	}
-	return(spec);
-}
-
-/* Since the WAV memory is allocated in the shared library, it must also
-   be freed here.  (Necessary under Win32, VC++)
- */
-void SDL_FreeWAV(Uint8 *audio_buf)
-{
-	if ( audio_buf != NULL ) {
-		SDL_free(audio_buf);
-	}
-}
-
-static int ReadChunk(SDL_RWops *src, Chunk *chunk)
-{
-	chunk->magic	= SDL_ReadLE32(src);
-	chunk->length	= SDL_ReadLE32(src);
-	chunk->data = (Uint8 *)SDL_malloc(chunk->length);
-	if ( chunk->data == NULL ) {
-		SDL_Error(SDL_ENOMEM);
-		return(-1);
-	}
-	if ( SDL_RWread(src, chunk->data, chunk->length, 1) != 1 ) {
-		SDL_Error(SDL_EFREAD);
-		SDL_free(chunk->data);
-		chunk->data = NULL;
-		return(-1);
-	}
-	return(chunk->length);
-}
diff --git a/src/audio/SDL_wave.h b/src/audio/SDL_wave.h
deleted file mode 100644
index 53b12e7..0000000
--- a/src/audio/SDL_wave.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is SDL_free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/* WAVE files are little-endian */
-
-/*******************************************/
-/* Define values for Microsoft WAVE format */
-/*******************************************/
-#define RIFF		0x46464952		/* "RIFF" */
-#define WAVE		0x45564157		/* "WAVE" */
-#define FACT		0x74636166		/* "fact" */
-#define LIST		0x5453494c		/* "LIST" */
-#define FMT		0x20746D66		/* "fmt " */
-#define DATA		0x61746164		/* "data" */
-#define PCM_CODE	0x0001
-#define MS_ADPCM_CODE	0x0002
-#define IMA_ADPCM_CODE	0x0011
-#define MP3_CODE	0x0055
-#define WAVE_MONO	1
-#define WAVE_STEREO	2
-
-/* Normally, these three chunks come consecutively in a WAVE file */
-typedef struct WaveFMT {
-/* Not saved in the chunk we read:
-	Uint32	FMTchunk;
-	Uint32	fmtlen;
-*/
-	Uint16	encoding;	
-	Uint16	channels;		/* 1 = mono, 2 = stereo */
-	Uint32	frequency;		/* One of 11025, 22050, or 44100 Hz */
-	Uint32	byterate;		/* Average bytes per second */
-	Uint16	blockalign;		/* Bytes per sample block */
-	Uint16	bitspersample;		/* One of 8, 12, 16, or 4 for ADPCM */
-} WaveFMT;
-
-/* The general chunk found in the WAVE file */
-typedef struct Chunk {
-	Uint32 magic;
-	Uint32 length;
-	Uint8 *data;
-} Chunk;
-
diff --git a/src/audio/SDL_wave.o b/src/audio/SDL_wave.o
deleted file mode 100644
index 1aa48cb..0000000
Binary files a/src/audio/SDL_wave.o and /dev/null differ
diff --git a/src/audio/dummy/SDL_dummyaudio.c b/src/audio/dummy/SDL_dummyaudio.c
deleted file mode 100644
index 484b50d..0000000
--- a/src/audio/dummy/SDL_dummyaudio.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-
-    This file written by Ryan C. Gordon (icculus@icculus.org)
-*/
-#include "SDL_config.h"
-
-/* Output audio to nowhere... */
-
-#include "SDL_rwops.h"
-#include "SDL_timer.h"
-#include "SDL_audio.h"
-#include "../SDL_audiomem.h"
-#include "../SDL_audio_c.h"
-#include "../SDL_audiodev_c.h"
-#include "SDL_dummyaudio.h"
-
-/* The tag name used by DUMMY audio */
-#define DUMMYAUD_DRIVER_NAME         "dummy"
-
-/* Audio driver functions */
-static int DUMMYAUD_OpenAudio(_THIS, SDL_AudioSpec *spec);
-static void DUMMYAUD_WaitAudio(_THIS);
-static void DUMMYAUD_PlayAudio(_THIS);
-static Uint8 *DUMMYAUD_GetAudioBuf(_THIS);
-static void DUMMYAUD_CloseAudio(_THIS);
-
-/* Audio driver bootstrap functions */
-static int DUMMYAUD_Available(void)
-{
-	const char *envr = SDL_getenv("SDL_AUDIODRIVER");
-	if (envr && (SDL_strcmp(envr, DUMMYAUD_DRIVER_NAME) == 0)) {
-		return(1);
-	}
-	return(0);
-}
-
-static void DUMMYAUD_DeleteDevice(SDL_AudioDevice *device)
-{
-	SDL_free(device->hidden);
-	SDL_free(device);
-}
-
-static SDL_AudioDevice *DUMMYAUD_CreateDevice(int devindex)
-{
-	SDL_AudioDevice *this;
-
-	/* Initialize all variables that we clean on shutdown */
-	this = (SDL_AudioDevice *)SDL_malloc(sizeof(SDL_AudioDevice));
-	if ( this ) {
-		SDL_memset(this, 0, (sizeof *this));
-		this->hidden = (struct SDL_PrivateAudioData *)
-				SDL_malloc((sizeof *this->hidden));
-	}
-	if ( (this == NULL) || (this->hidden == NULL) ) {
-		SDL_OutOfMemory();
-		if ( this ) {
-			SDL_free(this);
-		}
-		return(0);
-	}
-	SDL_memset(this->hidden, 0, (sizeof *this->hidden));
-
-	/* Set the function pointers */
-	this->OpenAudio = DUMMYAUD_OpenAudio;
-	this->WaitAudio = DUMMYAUD_WaitAudio;
-	this->PlayAudio = DUMMYAUD_PlayAudio;
-	this->GetAudioBuf = DUMMYAUD_GetAudioBuf;
-	this->CloseAudio = DUMMYAUD_CloseAudio;
-
-	this->free = DUMMYAUD_DeleteDevice;
-
-	return this;
-}
-
-AudioBootStrap DUMMYAUD_bootstrap = {
-	DUMMYAUD_DRIVER_NAME, "SDL dummy audio driver",
-	DUMMYAUD_Available, DUMMYAUD_CreateDevice
-};
-
-/* This function waits until it is possible to write a full sound buffer */
-static void DUMMYAUD_WaitAudio(_THIS)
-{
-	/* Don't block on first calls to simulate initial fragment filling. */
-	if (this->hidden->initial_calls)
-		this->hidden->initial_calls--;
-	else
-		SDL_Delay(this->hidden->write_delay);
-}
-
-static void DUMMYAUD_PlayAudio(_THIS)
-{
-	/* no-op...this is a null driver. */
-}
-
-static Uint8 *DUMMYAUD_GetAudioBuf(_THIS)
-{
-	return(this->hidden->mixbuf);
-}
-
-static void DUMMYAUD_CloseAudio(_THIS)
-{
-	if ( this->hidden->mixbuf != NULL ) {
-		SDL_FreeAudioMem(this->hidden->mixbuf);
-		this->hidden->mixbuf = NULL;
-	}
-}
-
-static int DUMMYAUD_OpenAudio(_THIS, SDL_AudioSpec *spec)
-{
-	float bytes_per_sec = 0.0f;
-
-	/* Allocate mixing buffer */
-	this->hidden->mixlen = spec->size;
-	this->hidden->mixbuf = (Uint8 *) SDL_AllocAudioMem(this->hidden->mixlen);
-	if ( this->hidden->mixbuf == NULL ) {
-		return(-1);
-	}
-	SDL_memset(this->hidden->mixbuf, spec->silence, spec->size);
-
-	bytes_per_sec = (float) (((spec->format & 0xFF) / 8) *
-	                   spec->channels * spec->freq);
-
-	/*
-	 * We try to make this request more audio at the correct rate for
-	 *  a given audio spec, so timing stays fairly faithful.
-	 * Also, we have it not block at all for the first two calls, so
-	 *  it seems like we're filling two audio fragments right out of the
-	 *  gate, like other SDL drivers tend to do.
-	 */
-	this->hidden->initial_calls = 2;
-	this->hidden->write_delay =
-	               (Uint32) ((((float) spec->size) / bytes_per_sec) * 1000.0f);
-
-	/* We're ready to rock and roll. :-) */
-	return(0);
-}
-
diff --git a/src/audio/dummy/SDL_dummyaudio.h b/src/audio/dummy/SDL_dummyaudio.h
deleted file mode 100644
index 74a69ca..0000000
--- a/src/audio/dummy/SDL_dummyaudio.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-#ifndef _SDL_dummyaudio_h
-#define _SDL_dummyaudio_h
-
-#include "../SDL_sysaudio.h"
-
-/* Hidden "this" pointer for the video functions */
-#define _THIS	SDL_AudioDevice *this
-
-struct SDL_PrivateAudioData {
-	/* The file descriptor for the audio device */
-	Uint8 *mixbuf;
-	Uint32 mixlen;
-	Uint32 write_delay;
-	Uint32 initial_calls;
-};
-
-#endif /* _SDL_dummyaudio_h */
diff --git a/src/audio/dummy/SDL_dummyaudio.o b/src/audio/dummy/SDL_dummyaudio.o
deleted file mode 100644
index 9894b3d..0000000
Binary files a/src/audio/dummy/SDL_dummyaudio.o and /dev/null differ
diff --git a/src/cdrom/SDL_cdrom.c b/src/cdrom/SDL_cdrom.c
deleted file mode 100644
index 8f91bb1..0000000
--- a/src/cdrom/SDL_cdrom.c
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/* This is the CD-audio control API for Simple DirectMedia Layer */
-
-#include "SDL_cdrom.h"
-#include "SDL_syscdrom.h"
-
-#if !defined(__MACOS__)
-#define CLIP_FRAMES	10	/* Some CD-ROMs won't go all the way */
-#endif
-
-static int SDL_cdinitted = 0;
-static SDL_CD *default_cdrom;
-
-/* The system level CD-ROM control functions */
-struct CDcaps SDL_CDcaps = {
-	NULL,					/* Name */
-	NULL,					/* Open */
-	NULL,					/* GetTOC */
-	NULL,					/* Status */
-	NULL,					/* Play */
-	NULL,					/* Pause */
-	NULL,					/* Resume */
-	NULL,					/* Stop */
-	NULL,					/* Eject */
-	NULL,					/* Close */
-};
-int SDL_numcds;
-
-int SDL_CDROMInit(void)
-{
-	int retval;
-
-	SDL_numcds = 0;
-	retval = SDL_SYS_CDInit();
-	if ( retval == 0 ) {
-		SDL_cdinitted = 1;
-	}
-	default_cdrom = NULL;
-	return(retval);
-}
-
-/* Check to see if the CD-ROM subsystem has been initialized */
-static int CheckInit(int check_cdrom, SDL_CD **cdrom)
-{
-	int okay;
-
-	okay = SDL_cdinitted;
-	if ( check_cdrom && (*cdrom == NULL) ) {
-		*cdrom = default_cdrom;
-		if ( *cdrom == NULL ) {
-			SDL_SetError("CD-ROM not opened");
-			okay = 0;
-		}
-	}
-	if ( ! SDL_cdinitted ) {
-		SDL_SetError("CD-ROM subsystem not initialized");
-	}
-	return(okay);
-}
-
-int SDL_CDNumDrives(void)
-{
-	if ( ! CheckInit(0, NULL) ) {
-		return(-1);
-	}
-	return(SDL_numcds);
-}
-
-const char *SDL_CDName(int drive)
-{
-	if ( ! CheckInit(0, NULL) ) {
-		return(NULL);
-	}
-	if ( drive >= SDL_numcds ) {
-		SDL_SetError("Invalid CD-ROM drive index");
-		return(NULL);
-	}
-	if ( SDL_CDcaps.Name ) {
-		return(SDL_CDcaps.Name(drive));
-	} else {
-		return("");
-	}
-}
-
-SDL_CD *SDL_CDOpen(int drive)
-{
-	struct SDL_CD *cdrom;
-
-	if ( ! CheckInit(0, NULL) ) {
-		return(NULL);
-	}
-	if ( drive >= SDL_numcds ) {
-		SDL_SetError("Invalid CD-ROM drive index");
-		return(NULL);
-	}
-	cdrom = (SDL_CD *)SDL_malloc(sizeof(*cdrom));
-	if ( cdrom == NULL ) {
-		SDL_OutOfMemory();
-		return(NULL);
-	}
-	SDL_memset(cdrom, 0, sizeof(*cdrom));
-	cdrom->id = SDL_CDcaps.Open(drive);
-	if ( cdrom->id < 0 ) {
-		SDL_free(cdrom);
-		return(NULL);
-	}
-	default_cdrom = cdrom;
-	return(cdrom);
-}
-
-CDstatus SDL_CDStatus(SDL_CD *cdrom)
-{
-	CDstatus status;
-	int i;
-	Uint32 position;
-
-	/* Check if the CD-ROM subsystem has been initialized */
-	if ( ! CheckInit(1, &cdrom) ) {
-		return(CD_ERROR);
-	}
-
-	/* Get the current status of the drive */
-	cdrom->numtracks = 0;
-	cdrom->cur_track = 0;
-	cdrom->cur_frame = 0;
-	status = SDL_CDcaps.Status(cdrom, &i);
-	position = (Uint32)i;
-	cdrom->status = status;
-
-	/* Get the table of contents, if there's a CD available */
-	if ( CD_INDRIVE(status) ) {
-		if ( SDL_CDcaps.GetTOC(cdrom) < 0 ) {
-			status = CD_ERROR;
-		}
-		/* If the drive is playing, get current play position */
-		if ( (status == CD_PLAYING) || (status == CD_PAUSED) ) {
-			for ( i=1; cdrom->track[i].offset <= position; ++i ) {
-				/* Keep looking */;
-			}
-#ifdef DEBUG_CDROM
-  fprintf(stderr, "Current position: %d, track = %d (offset is %d)\n",
-				position, i-1, cdrom->track[i-1].offset);
-#endif
-			cdrom->cur_track = i-1;
-			position -= cdrom->track[cdrom->cur_track].offset;
-			cdrom->cur_frame = position;
-		}
-	}
-	return(status);
-}
-
-int SDL_CDPlayTracks(SDL_CD *cdrom,
-			int strack, int sframe, int ntracks, int nframes)
-{
-	int etrack, eframe;
-	int start, length;
-
-	/* Check if the CD-ROM subsystem has been initialized */
-	if ( ! CheckInit(1, &cdrom) ) {
-		return(CD_ERROR);
-	}
-
-	/* Determine the starting and ending tracks */
-	if ( (strack < 0) || (strack >= cdrom->numtracks) ) {
-		SDL_SetError("Invalid starting track");
-		return(CD_ERROR);
-	}
-	if ( ! ntracks && ! nframes ) {
-		etrack = cdrom->numtracks;
-		eframe = 0;
-	} else {
-		etrack = strack+ntracks;
-		if ( etrack == strack ) {
-			eframe = sframe + nframes;
-		} else {
-			eframe = nframes;
-		}
-	}
-	if ( etrack > cdrom->numtracks ) {
-		SDL_SetError("Invalid play length");
-		return(CD_ERROR);
-	}
-
-	/* Skip data tracks and verify frame offsets */
-	while ( (strack <= etrack) &&
-			(cdrom->track[strack].type == SDL_DATA_TRACK) ) {
-		++strack;
-	}
-	if ( sframe >= (int)cdrom->track[strack].length ) {
-		SDL_SetError("Invalid starting frame for track %d", strack);
-		return(CD_ERROR);
-	}
-	while ( (etrack > strack) &&
-			(cdrom->track[etrack-1].type == SDL_DATA_TRACK) ) {
-		--etrack;
-	}
-	if ( eframe > (int)cdrom->track[etrack].length ) {
-		SDL_SetError("Invalid ending frame for track %d", etrack);
-		return(CD_ERROR);
-	}
-
-	/* Determine start frame and play length */
-	start = (cdrom->track[strack].offset+sframe);
-	length = (cdrom->track[etrack].offset+eframe)-start;
-#ifdef CLIP_FRAMES
-	/* I've never seen this necessary, but xmcd does it.. */
-	length -= CLIP_FRAMES;	/* CLIP_FRAMES == 10 */
-#endif
-	if ( length < 0 ) {
-		return(0);
-	}
-
-	/* Play! */
-#ifdef DEBUG_CDROM
-  fprintf(stderr, "Playing %d frames at offset %d\n", length, start);
-#endif
-	return(SDL_CDcaps.Play(cdrom, start, length));
-}
-
-int SDL_CDPlay(SDL_CD *cdrom, int sframe, int length)
-{
-	/* Check if the CD-ROM subsystem has been initialized */
-	if ( ! CheckInit(1, &cdrom) ) {
-		return(CD_ERROR);
-	}
-
-	return(SDL_CDcaps.Play(cdrom, sframe, length));
-}
-
-int SDL_CDPause(SDL_CD *cdrom)
-{
-	CDstatus status;
-	int retval;
-
-	/* Check if the CD-ROM subsystem has been initialized */
-	if ( ! CheckInit(1, &cdrom) ) {
-		return(CD_ERROR);
-	}
-
-	status = SDL_CDcaps.Status(cdrom, NULL);
-	switch (status) {
-		case CD_PLAYING:
-			retval = SDL_CDcaps.Pause(cdrom);
-			break;
-		default:
-			retval = 0;
-			break;
-	}
-	return(retval);
-}
-
-int SDL_CDResume(SDL_CD *cdrom)
-{
-	CDstatus status;
-	int retval;
-
-	/* Check if the CD-ROM subsystem has been initialized */
-	if ( ! CheckInit(1, &cdrom) ) {
-		return(CD_ERROR);
-	}
-
-	status = SDL_CDcaps.Status(cdrom, NULL);
-	switch (status) {
-		case CD_PAUSED:
-			retval = SDL_CDcaps.Resume(cdrom);
-		default:
-			retval = 0;
-			break;
-	}
-	return(retval);
-}
-
-int SDL_CDStop(SDL_CD *cdrom)
-{
-	CDstatus status;
-	int retval;
-
-	/* Check if the CD-ROM subsystem has been initialized */
-	if ( ! CheckInit(1, &cdrom) ) {
-		return(CD_ERROR);
-	}
-
-	status = SDL_CDcaps.Status(cdrom, NULL);
-	switch (status) {
-		case CD_PLAYING:
-		case CD_PAUSED:
-			retval = SDL_CDcaps.Stop(cdrom);
-		default:
-			retval = 0;
-			break;
-	}
-	return(retval);
-}
-
-int SDL_CDEject(SDL_CD *cdrom)
-{
-	/* Check if the CD-ROM subsystem has been initialized */
-	if ( ! CheckInit(1, &cdrom) ) {
-		return(CD_ERROR);
-	}
-	return(SDL_CDcaps.Eject(cdrom));
-}
-
-void SDL_CDClose(SDL_CD *cdrom)
-{
-	/* Check if the CD-ROM subsystem has been initialized */
-	if ( ! CheckInit(1, &cdrom) ) {
-		return;
-	}
-	SDL_CDcaps.Close(cdrom);
-	SDL_free(cdrom);
-	default_cdrom = NULL;
-}
-
-void SDL_CDROMQuit(void)
-{
-	SDL_SYS_CDQuit();
-	SDL_cdinitted = 0;
-}
diff --git a/src/cdrom/SDL_cdrom.o b/src/cdrom/SDL_cdrom.o
deleted file mode 100644
index 372d3ad..0000000
Binary files a/src/cdrom/SDL_cdrom.o and /dev/null differ
diff --git a/src/cdrom/SDL_syscdrom.h b/src/cdrom/SDL_syscdrom.h
deleted file mode 100644
index 0feeee5..0000000
--- a/src/cdrom/SDL_syscdrom.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is SDL_free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/* This is the system specific header for the SDL CD-ROM API */
-
-/* Structure of CD audio control functions */
-extern struct CDcaps {
-	/* Get the name of the specified drive */
-	const char *(*Name)(int drive);
-
-	/* Open the specified drive, returning a drive id, or -1 on error */
-	int (*Open)(int drive);
-
-	/* Get table-of-contents (number of tracks + track info) for disk.
-	   The TOC information should be stored in the cdrom structure.
-	   This function should return 0 on success, or -1 on error.
-	 */
-	int (*GetTOC)(SDL_CD *cdrom);
-
-	/* Return the current status and play position, in frames, of the
-	   drive.  'position' may be NULL, and if so, should be ignored.
-	 */
-	CDstatus (*Status)(SDL_CD *cdrom, int *position);
-
-	/* Play from frame 'start' to 'start+len' */
-	int (*Play)(SDL_CD *cdrom, int start, int len); 
-
-	/* Pause play */
-	int (*Pause)(SDL_CD *cdrom);
-
-	/* Resume play */
-	int (*Resume)(SDL_CD *cdrom);
-
-	/* Stop play */
-	int (*Stop)(SDL_CD *cdrom);
-
-	/* Eject the current disk */
-	int (*Eject)(SDL_CD *cdrom);
-
-	/* Close the specified drive */
-	void (*Close)(SDL_CD *cdrom);
-} SDL_CDcaps;
-
-/* The number of available CD-ROM drives on the system */
-extern int SDL_numcds;
-
-/* Function to scan the system for CD-ROM drives and fill SDL_CDcaps.
- * This function should set SDL_numcds to the number of available CD
- * drives.  Drive 0 should be the system default CD-ROM.
- * It should return 0, or -1 on an unrecoverable fatal error.
-*/
-extern int  SDL_SYS_CDInit(void);
-
-/* Function to perform any system-specific CD-ROM related cleanup */
-extern void SDL_SYS_CDQuit(void);
-
diff --git a/src/cdrom/dummy/SDL_syscdrom.c b/src/cdrom/dummy/SDL_syscdrom.c
deleted file mode 100644
index 9821e97..0000000
--- a/src/cdrom/dummy/SDL_syscdrom.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-#if defined(SDL_CDROM_DUMMY) || defined(SDL_CDROM_DISABLED)
-
-/* Stub functions for system-level CD-ROM audio control */
-
-#include "SDL_cdrom.h"
-#include "../SDL_syscdrom.h"
-
-int  SDL_SYS_CDInit(void)
-{
-	return(0);
-}
-
-void SDL_SYS_CDQuit(void)
-{
-	return;
-}
-
-#endif /* SDL_CDROM_DUMMY || SDL_CDROM_DISABLED */
diff --git a/src/cdrom/dummy/SDL_syscdrom.o b/src/cdrom/dummy/SDL_syscdrom.o
deleted file mode 100644
index 09d5574..0000000
Binary files a/src/cdrom/dummy/SDL_syscdrom.o and /dev/null differ
diff --git a/src/gfx/SDL_framerate.c b/src/gfx/SDL_framerate.c
new file mode 100644
index 0000000..6735f63
--- /dev/null
+++ b/src/gfx/SDL_framerate.c
@@ -0,0 +1,189 @@
+/*
+
+SDL_framerate.c: framerate manager
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#include "SDL_framerate.h"
+
+/*!
+\brief Internal wrapper to SDL_GetTicks that ensures a non-zero return value.
+
+SDL_framerateDelay treats baseticks==0 as "manager not initialized", so a
+genuine tick count of 0 must never be stored there.
+
+\return The tick count as reported by SDL_GetTicks, or 1 if it reported 0.
+*/
+Uint32 _getTicks(void)
+{
+	Uint32 now = SDL_GetTicks();
+
+	/*
+	* SDL_GetTicks may still return 0 if it is called before the
+	* counter has advanced; substitute the smallest non-zero value.
+	*/
+	if (now == 0) {
+		now = 1;
+	}
+	return now;
+}
+
+/*!
+\brief Initialize the framerate manager.
+
+Resets the frame counter, selects the FPS_DEFAULT rate and restarts the
+delay interpolation from the current tick count. A NULL manager is ignored.
+
+\param manager Pointer to the framerate manager.
+*/
+void SDL_initFramerate(FPSmanager * manager)
+{
+	/* Nothing to set up; the getters in this file tolerate NULL as well. */
+	if (manager == NULL) {
+		return;
+	}
+	manager->framecount = 0;
+	manager->rate = FPS_DEFAULT;
+	manager->rateticks = (1000.0f / (float) FPS_DEFAULT);
+	manager->baseticks = _getTicks();	/* never 0, so it marks "initialized" */
+	manager->lastticks = manager->baseticks;
+}
+
+/*!
+\brief Set the framerate in Hz
+
+Sets a new framerate for the manager and resets the frame counter.
+Rate values must be between FPS_LOWER_LIMIT and FPS_UPPER_LIMIT inclusive to be accepted.
+
+\param manager Pointer to the framerate manager; NULL is rejected.
+\param rate The new framerate in Hz (frames per second).
+
+\return 0 for success and -1 for error (NULL manager or rate out of range).
+*/
+int SDL_setFramerate(FPSmanager * manager, Uint32 rate)
+{
+	/* Reject bad arguments up front and leave the manager untouched. */
+	if ((manager == NULL) || (rate < FPS_LOWER_LIMIT) || (rate > FPS_UPPER_LIMIT)) {
+		return (-1);
+	}
+	manager->framecount = 0;
+	manager->rate = rate;
+	manager->rateticks = (1000.0f / (float) rate);
+	return (0);
+}
+
+/*!
+\brief Return the current target framerate in Hz
+
+Reads back the rate currently stored in the manager.
+
+\param manager Pointer to the framerate manager.
+
+\return The stored framerate in Hz, or -1 when manager is NULL.
+*/
+int SDL_getFramerate(FPSmanager * manager)
+{
+	/* Guard first so the normal path is a plain field read. */
+	if (manager == NULL) {
+		return (-1);
+	}
+	return ((int)manager->rate);
+}
+
+/*!
+\brief Return the current framecount.
+
+Reads the frame counter kept by the framerate manager. The counter is
+incremented by SDL_framerateDelay and reset when pacing is restarted.
+
+\param manager Pointer to the framerate manager.
+
+\return The stored frame count, or -1 when manager is NULL.
+*/
+int SDL_getFramecount(FPSmanager * manager)
+{
+	int count;
+
+	/* Error sentinel for a NULL manager, otherwise the stored counter. */
+	count = (manager == NULL) ? -1 : (int)manager->framecount;
+	return (count);
+}
+
+/*!
+\brief Delay execution to maintain a constant framerate and calculate fps.
+
+Sleeps until the next frame is due at the currently set framerate. Call once
+per iteration of the graphics/rendering loop. If the caller is already late
+for the frame, no sleep happens and the delay interpolation is restarted.
+
+\param manager Pointer to the framerate manager.
+
+\return The time that passed since the last call to the function in ms. May return 0.
+*/
+Uint32 SDL_framerateDelay(FPSmanager * manager)
+{
+	Uint32 now;
+	Uint32 due;
+	Uint32 elapsed;
+
+	/*
+	* Without a manager there is nothing to pace.
+	*/
+	if (manager == NULL) {
+		return 0;
+	}
+
+	/*
+	* baseticks==0 marks a manager that was never initialized.
+	*/
+	if (manager->baseticks == 0) {
+		SDL_initFramerate(manager);
+	}
+
+	/*
+	* Count this frame, then sample the clock and record the
+	* interval since the previous call.
+	*/
+	manager->framecount++;
+	now = _getTicks();
+	elapsed = now - manager->lastticks;
+	manager->lastticks = now;
+
+	/*
+	* The frame is due framecount frame-periods after baseticks.
+	*/
+	due = manager->baseticks + (Uint32) ((float) manager->framecount * manager->rateticks);
+	if (now > due) {
+		/* Running late: restart the interpolation from here. */
+		manager->framecount = 0;
+		manager->baseticks = _getTicks();
+	} else {
+		/* On time or early: sleep away the remainder. */
+		SDL_Delay(due - now);
+	}
+
+	return elapsed;
+}
diff --git a/src/gfx/SDL_gfxBlitFunc.c b/src/gfx/SDL_gfxBlitFunc.c
new file mode 100644
index 0000000..1c0f68e
--- /dev/null
+++ b/src/gfx/SDL_gfxBlitFunc.c
@@ -0,0 +1,639 @@
+/* 
+
+SDL_gfxBlitFunc.c: custom blitters
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#include "SDL_gfxBlitFunc.h"
+
+/*!
+\brief Alpha adjustment table for the custom blitter.
+
+Lookup table with one entry per 8-bit alpha value. It implements a modified,
+non-linear transfer function which maintains brightness: entries never
+decrease, start at 0, end at 255, and rise fastest for small inputs.
+
+Eight entries per row; the trailing comment gives the index range of the row.
+*/
+const unsigned int GFX_ALPHA_ADJUST_ARRAY[256] = {
+	  0,  15,  22,  27,  31,  35,  39,  42,	/*   0 -   7 */
+	 45,  47,  50,  52,  55,  57,  59,  61,	/*   8 -  15 */
+	 63,  65,  67,  69,  71,  73,  74,  76,	/*  16 -  23 */
+	 78,  79,  81,  82,  84,  85,  87,  88,	/*  24 -  31 */
+	 90,  91,  93,  94,  95,  97,  98,  99,	/*  32 -  39 */
+	100, 102, 103, 104, 105, 107, 108, 109,	/*  40 -  47 */
+	110, 111, 112, 114, 115, 116, 117, 118,	/*  48 -  55 */
+	119, 120, 121, 122, 123, 124, 125, 126,	/*  56 -  63 */
+	127, 128, 129, 130, 131, 132, 133, 134,	/*  64 -  71 */
+	135, 136, 137, 138, 139, 140, 141, 141,	/*  72 -  79 */
+	142, 143, 144, 145, 146, 147, 148, 148,	/*  80 -  87 */
+	149, 150, 151, 152, 153, 153, 154, 155,	/*  88 -  95 */
+	156, 157, 158, 158, 159, 160, 161, 162,	/*  96 - 103 */
+	162, 163, 164, 165, 165, 166, 167, 168,	/* 104 - 111 */
+	168, 169, 170, 171, 171, 172, 173, 174,	/* 112 - 119 */
+	174, 175, 176, 177, 177, 178, 179, 179,	/* 120 - 127 */
+	180, 181, 182, 182, 183, 184, 184, 185,	/* 128 - 135 */
+	186, 186, 187, 188, 188, 189, 190, 190,	/* 136 - 143 */
+	191, 192, 192, 193, 194, 194, 195, 196,	/* 144 - 151 */
+	196, 197, 198, 198, 199, 200, 200, 201,	/* 152 - 159 */
+	201, 202, 203, 203, 204, 205, 205, 206,	/* 160 - 167 */
+	206, 207, 208, 208, 209, 210, 210, 211,	/* 168 - 175 */
+	211, 212, 213, 213, 214, 214, 215, 216,	/* 176 - 183 */
+	216, 217, 217, 218, 218, 219, 220, 220,	/* 184 - 191 */
+	221, 221, 222, 222, 223, 224, 224, 225,	/* 192 - 199 */
+	225, 226, 226, 227, 228, 228, 229, 229,	/* 200 - 207 */
+	230, 230, 231, 231, 232, 233, 233, 234,	/* 208 - 215 */
+	234, 235, 235, 236, 236, 237, 237, 238,	/* 216 - 223 */
+	238, 239, 240, 240, 241, 241, 242, 242,	/* 224 - 231 */
+	243, 243, 244, 244, 245, 245, 246, 246,	/* 232 - 239 */
+	247, 247, 248, 248, 249, 249, 250, 250,	/* 240 - 247 */
+	251, 251, 252, 252, 253, 253, 254, 255	/* 248 - 255 */
+};
+
+/*!
+\brief Internal blitter using adjusted destination alpha during RGBA->RGBA blits.
+
+Performs the blit based on the 'info' structure and applies the transfer function
+to the destination 'a' values.
+
+For every pixel the source and destination pixels are unpacked, the source alpha
+is mapped through GFX_ALPHA_ADJUST_ARRAY, the color channels are blended with
+GFX_ALPHA_BLEND using that adjusted alpha, and the adjusted alpha is OR-ed into
+the destination alpha before the pixel is packed back into the destination.
+The loop bounds are taken from info->d_width and info->d_height; info->s_skip
+and info->d_skip are added to the pointers after each row.
+
+\param info The blit info to use.
+*/
+void _SDL_gfxBlitBlitterRGBA(SDL_gfxBlitInfo * info)
+{
+	/* Local copies of everything the pixel loop reads from 'info'. */
+	int       width = info->d_width;
+	int       height = info->d_height;
+	Uint8    *src = info->s_pixels;
+	int       srcskip = info->s_skip;
+	Uint8    *dst = info->d_pixels;
+	int       dstskip = info->d_skip;
+	SDL_PixelFormat *srcfmt = info->src;
+	SDL_PixelFormat *dstfmt = info->dst;
+	Uint8       srcbpp = srcfmt->BytesPerPixel;
+	Uint8       dstbpp = dstfmt->BytesPerPixel;
+
+	/* One iteration per row. */
+	while (height--) {
+		/*
+		* The braced block below is the first argument of GFX_DUFFS_LOOP4 and is
+		* run once per pixel of the row.
+		* NOTE(review): the locals are declared one per line, presumably so the
+		* block contains no top-level commas that would split the macro
+		* argument - keep it that way when editing.
+		*/
+		GFX_DUFFS_LOOP4( {
+			Uint32 pixel;
+			unsigned sR;
+			unsigned sG;
+			unsigned sB;
+			unsigned sA;
+			unsigned dR;
+			unsigned dG;
+			unsigned dB;
+			unsigned dA;
+			unsigned sAA;
+			/* 'pixel' is scratch storage shared by both unpack steps. */
+			GFX_DISASSEMBLE_RGBA(src, srcbpp, srcfmt, pixel, sR, sG, sB, sA);
+			GFX_DISASSEMBLE_RGBA(dst, dstbpp, dstfmt, pixel, dR, dG, dB, dA);
+			/* Adjusted source alpha; the mask keeps the table index within 0..255. */
+			sAA=GFX_ALPHA_ADJUST_ARRAY[sA & 255];
+			GFX_ALPHA_BLEND(sR, sG, sB, sAA, dR, dG, dB);
+			/* Destination alpha only ever gains bits from the adjusted source alpha. */
+			dA |= sAA;
+			GFX_ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
+			src += srcbpp; dst += dstbpp;
+		}, width);
+		/* Step over the per-row skip to reach the start of the next row. */
+		src += srcskip;
+		dst += dstskip;
+	}
+}
+
+/*!
+\brief Internal blitter setup wrapper for RGBA->RGBA blits.
+
+Fills an SDL_gfxBlitInfo from the 'src' and 'dst' surfaces and rectangles and
+hands it to _SDL_gfxBlitBlitterRGBA. No clipping is done here; the rectangles
+are used as given.
+
+\param src The source surface.
+\param srcrect The source rectangle.
+\param dst The destination surface.
+\param dstrect The destination rectangle.
+
+\returns Returns 1 if blit was performed, 0 otherwise.
+*/
+int _SDL_gfxBlitRGBACall(SDL_Surface * src, SDL_Rect * srcrect, SDL_Surface * dst, SDL_Rect * dstrect)
+{
+	SDL_gfxBlitInfo info;
+	Uint8 *s_start;
+	Uint8 *d_start;
+
+	/* An empty source rectangle leaves nothing to blit. */
+	if (!srcrect->w || !srcrect->h) {
+		return (0);
+	}
+
+	/* Address of the first source pixel inside srcrect. */
+	s_start = (Uint8 *) src->pixels;
+#if (SDL_MINOR_VERSION != 3)
+	s_start += src->offset;
+#endif
+	s_start += (Uint16) srcrect->y * src->pitch;
+	s_start += (Uint16) srcrect->x * src->format->BytesPerPixel;
+
+	/* Address of the first destination pixel inside dstrect. */
+	d_start = (Uint8 *) dst->pixels;
+#if (SDL_MINOR_VERSION != 3)
+	d_start += dst->offset;
+#endif
+	d_start += (Uint16) dstrect->y * dst->pitch;
+	d_start += (Uint16) dstrect->x * dst->format->BytesPerPixel;
+
+	/* Source side of the blit description. */
+	info.s_pixels = s_start;
+	info.s_width = srcrect->w;
+	info.s_height = srcrect->h;
+	info.s_skip = (int)(src->pitch - info.s_width * src->format->BytesPerPixel);
+
+	/* Destination side of the blit description. */
+	info.d_pixels = d_start;
+	info.d_width = dstrect->w;
+	info.d_height = dstrect->h;
+	info.d_skip = (int)(dst->pitch - info.d_width * dst->format->BytesPerPixel);
+
+	/* Formats; the auxiliary data and table slots are left empty. */
+	info.aux_data = NULL;
+	info.src = src->format;
+	info.table = NULL;
+	info.dst = dst->format;
+
+	/* Run the actual software blitter. */
+	_SDL_gfxBlitBlitterRGBA(&info);
+
+	return 1;
+}
+
+/*!
+\brief Blitter for RGBA->RGBA blits with alpha adjustment.
+
+Verifies the input 'src' and 'dst' surfaces and rectangles and performs blit.
+The source rectangle is clipped to the source surface and the resulting
+destination area is clipped to the destination clip rectangle.
+
+Only the x/y position of 'dstrect' is used; the blitted size always comes from
+the (clipped) source rectangle. 'dstrect' itself is never written to.
+
+\param src The source surface.
+\param srcrect The source rectangle, or NULL to use the whole source surface.
+\param dst The destination surface.
+\param dstrect The destination rectangle, or NULL to blit to position (0,0).
+
+\returns Returns 1 if blit was performed, 0 otherwise, or -1 if an error occurred.
+*/
+int SDL_gfxBlitRGBA(SDL_Surface * src, SDL_Rect * srcrect, SDL_Surface * dst, SDL_Rect * dstrect)
+{
+	SDL_Rect  sr, dr;
+	int       srcx, srcy, w, h;
+
+	/*
+	* Both surfaces are required 
+	*/
+	if (!src || !dst) {
+		/* Report under this function's own name, like the other SDL_gfx entry points */
+		SDL_SetError("SDL_gfxBlitRGBA: passed a NULL surface");
+		return (-1);
+	}
+
+	/*
+	* Make sure the surfaces aren't locked 
+	*/
+	if ((src->locked) || (dst->locked)) {
+		SDL_SetError("Surfaces must not be locked during blit");
+		return (-1);
+	}
+
+	/*
+	* If the destination rectangle is NULL, use the entire dest surface 
+	*/
+	if (dstrect == NULL) {
+		dr.x = dr.y = 0;
+		dr.w = dst->w;
+		dr.h = dst->h;
+	} else {
+		/* Work on a copy so the caller's rectangle stays untouched */
+		dr = *dstrect;
+	}
+
+	/*
+	* Clip the source rectangle to the source surface 
+	*/
+	if (srcrect) {
+		int       maxw, maxh;
+
+		srcx = srcrect->x;
+		w = srcrect->w;
+		if (srcx < 0) {
+			/* Drop the part left of the surface and shift the target right by the same amount */
+			w += srcx;
+			dr.x -= srcx;
+			srcx = 0;
+		}
+		maxw = src->w - srcx;
+		if (maxw < w)
+			w = maxw;
+
+		srcy = srcrect->y;
+		h = srcrect->h;
+		if (srcy < 0) {
+			/* Same for the part above the surface */
+			h += srcy;
+			dr.y -= srcy;
+			srcy = 0;
+		}
+		maxh = src->h - srcy;
+		if (maxh < h)
+			h = maxh;
+
+	} else {
+		srcx = srcy = 0;
+		w = src->w;
+		h = src->h;
+	}
+
+	/*
+	* Clip the destination rectangle against the clip rectangle 
+	*/
+	{
+		SDL_Rect *clip = &dst->clip_rect;
+		int       dx, dy;
+
+		/* Left edge: skip the same number of columns in source and destination */
+		dx = clip->x - dr.x;
+		if (dx > 0) {
+			w -= dx;
+			dr.x += dx;
+			srcx += dx;
+		}
+		/* Right edge: just shorten the width */
+		dx = dr.x + w - clip->x - clip->w;
+		if (dx > 0)
+			w -= dx;
+
+		/* Top edge */
+		dy = clip->y - dr.y;
+		if (dy > 0) {
+			h -= dy;
+			dr.y += dy;
+			srcy += dy;
+		}
+		/* Bottom edge */
+		dy = dr.y + h - clip->y - clip->h;
+		if (dy > 0)
+			h -= dy;
+	}
+
+	/*
+	* Blit only if something is left after clipping 
+	*/
+	if (w > 0 && h > 0) {
+		sr.x = srcx;
+		sr.y = srcy;
+		sr.w = dr.w = w;
+		sr.h = dr.h = h;
+		return (_SDL_gfxBlitRGBACall(src, &sr, dst, &dr));
+	}
+
+	return 0;
+}
+
+/*!
+\brief Sets the alpha channel in a 32 bit surface.
+
+Overwrites one byte of every 4-byte pixel with the constant 'a'. Which byte
+is written depends on the byte order the library was compiled for (first
+byte on big-endian builds, fourth byte otherwise).
+Only 32 bit surfaces can be used with this function.
+
+\param src Pointer to the target surface to change.
+\param a The alpha value to set.
+
+\return Returns 1 if alpha was changed, -1 otherwise.
+*/
+int SDL_gfxSetAlpha(SDL_Surface *src, Uint8 a)
+{
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+	const int alpha_offset = 0;
+#else
+	const int alpha_offset = 3;
+#endif
+	Uint8 *cursor;
+	int row, col, padding;
+
+	/* Reject anything that is not a 4-bytes-per-pixel surface. */
+	if (src == NULL || src->format == NULL || src->format->BytesPerPixel != 4) {
+		SDL_SetError("SDL_gfxSetAlpha: Invalid input surface.");
+		return -1;
+	}
+
+	/* Lock the surface when it asks for it; give up if that fails. */
+	if (SDL_MUSTLOCK(src) && (SDL_LockSurface(src) < 0)) {
+		return (-1);
+	}
+
+	/* Bytes between the end of one row's pixels and the start of the next row. */
+	padding = (src->pitch - (4*src->w));
+
+	/* Walk the alpha byte of every pixel, row by row. */
+	cursor = (Uint8 *)src->pixels + alpha_offset;
+	for (row = 0; row < src->h; row++) {
+		for (col = 0; col < src->w; col++, cursor += 4) {
+			*cursor = a;
+		}
+		cursor += padding;
+	}
+
+	/* Release the lock taken above. */
+	if (SDL_MUSTLOCK(src)) {
+		SDL_UnlockSurface(src);
+	}
+
+	return 1;
+}
+
+/*!
+\brief Multiply the alpha channel in a 32bit surface.
+
+Scales one byte of every 4-byte pixel by the constant 'a'; the product is
+shifted right by 8 bits, so the effective factor is a/256 and the result
+stays in the range 0-255. Which byte is scaled depends on the byte order the
+library was compiled for (first byte on big-endian builds, fourth otherwise).
+
+Only 32 bit surfaces can be used with this function.
+
+\param src Pointer to the target surface to change.
+\param a The alpha value to multiply with. When a is 255, this function is a NoOp.
+
+\return Returns 1 if alpha was changed, 0 otherwise. Returns -1 if input surface is invalid.
+*/
+int SDL_gfxMultiplyAlpha(SDL_Surface *src, Uint8 a)
+{
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN
+	const int alpha_offset = 0;
+#else
+	const int alpha_offset = 3;
+#endif
+	Uint8 *cursor;
+	int row, col, padding;
+
+	/* Reject anything that is not a 4-bytes-per-pixel surface. */
+	if (src == NULL || src->format == NULL || src->format->BytesPerPixel != 4) {
+		SDL_SetError("SDL_gfxMultiplyAlpha: Invalid input surface.");
+		return -1;
+	}
+
+	/* 255 is defined as "leave the surface alone". */
+	if (a == 255) {
+		return 0;
+	}
+
+	/* Lock the surface when it asks for it; give up if that fails. */
+	if (SDL_MUSTLOCK(src) && (SDL_LockSurface(src) < 0)) {
+		return (-1);
+	}
+
+	/* Bytes between the end of one row's pixels and the start of the next row. */
+	padding = (src->pitch - (4*src->w));
+
+	/* Scale the alpha byte of every pixel, row by row. */
+	cursor = (Uint8 *)src->pixels + alpha_offset;
+	for (row = 0; row < src->h; row++) {
+		for (col = 0; col < src->w; col++, cursor += 4) {
+			int product = (int)(*cursor) * a;
+			*cursor = (Uint8)(product >> 8);
+		}
+		cursor += padding;
+	}
+
+	/* Release the lock taken above. */
+	if (SDL_MUSTLOCK(src)) {
+		SDL_UnlockSurface(src);
+	}
+
+	return 1;
+}
diff --git a/src/gfx/SDL_gfxPrimitives.c b/src/gfx/SDL_gfxPrimitives.c
new file mode 100644
index 0000000..ae8b998
--- /dev/null
+++ b/src/gfx/SDL_gfxPrimitives.c
@@ -0,0 +1,6851 @@
+/* 
+
+SDL_gfxPrimitives.c: graphics primitives for SDL surfaces
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <string.h>
+
+#include "SDL_gfxPrimitives.h"
+#include "SDL_rotozoom.h"
+#include "SDL_gfxPrimitives_font.h"
+#include "SDL_gfxBlitFunc.h"
+
+/* -===================- */
+
+/*
+* Compile-time switches for the 32-bit alpha blending code in this file.
+*
+* DEFAULT_ALPHA_PIXEL_ROUTINE / EXPERIMENTAL_ALPHA_PIXEL_ROUTINE select which
+* 'case 4' (4 bytes per pixel) implementation is compiled; each implementation
+* supplies its own 'case 4' label, so only one of the two may be defined.
+* ALPHA_PIXEL_ADDITIVE_BLEND makes the default routine OR a
+* GFX_ALPHA_ADJUST_ARRAY value into the destination alpha instead of
+* interpolating the alpha channel.
+*/
+#define DEFAULT_ALPHA_PIXEL_ROUTINE
+#undef EXPERIMENTAL_ALPHA_PIXEL_ROUTINE
+#define ALPHA_PIXEL_ADDITIVE_BLEND
+
+/* ---- Structures */
+
+/*!
+\brief State block handed to the internal Bresenham iterator.
+
+NOTE(review): the code that fills and advances these members is not part of
+this section of the file; confirm member meanings there before relying on them.
+*/
+typedef struct {
+	Sint16 x;
+	Sint16 y;
+	int dx;
+	int dy;
+	int s1;
+	int s2;
+	int swapdir;
+	int error;
+	Uint32 count;
+} SDL_gfxBresenhamIterator;
+
+/*!
+\brief State block handed to the internal Murphy iterator.
+*/
+typedef struct {
+	Uint32 color;
+	SDL_Surface *dst;
+	/* delta x , delta y */
+	int u;
+	int v;
+	/* loop constants */
+	int ku;
+	int kt;
+	int kv;
+	int kd;
+	int oct2;
+	int quad4;
+	Sint16 last1x;
+	Sint16 last1y;
+	Sint16 last2x;
+	Sint16 last2y;
+	Sint16 first1x;
+	Sint16 first1y;
+	Sint16 first2x;
+	Sint16 first2y;
+	Sint16 tempx;
+	Sint16 tempy;
+} SDL_gfxMurphyIterator;
+
+/* ----- Defines for pixel clipping tests */
+
+/*
+* Inclusive bounds of a surface's clip rectangle. 'surface' is a pointer to a
+* structure with a 'clip_rect' member (x, y, w, h).
+* Both the argument and the whole expansion are parenthesized so the macros
+* can be given any pointer expression (e.g. &s) and used inside larger
+* expressions without precedence surprises.
+*/
+#define clip_xmin(surface) ((surface)->clip_rect.x)
+#define clip_xmax(surface) ((surface)->clip_rect.x + (surface)->clip_rect.w - 1)
+#define clip_ymin(surface) ((surface)->clip_rect.y)
+#define clip_ymax(surface) ((surface)->clip_rect.y + (surface)->clip_rect.h - 1)
+
+/*!
+\brief Internal pixel drawing - fast, no blending, no locking, clipping.
+
+Writes 'color' straight into the pixel buffer when (x,y) lies inside the
+surface's clip rectangle; pixels outside of it are silently skipped.
+
+\param dst The surface to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param color The color value of the pixel to draw. 
+
+\returns Always returns 0, whether or not a pixel was written.
+*/
+int fastPixelColorNolock(SDL_Surface * dst, Sint16 x, Sint16 y, Uint32 color)
+{
+	int bpp;
+	Uint8 *p;
+
+	/* Outside the clip rectangle: nothing to draw. */
+	if ((x < clip_xmin(dst)) || (x > clip_xmax(dst)) || (y < clip_ymin(dst)) || (y > clip_ymax(dst))) {
+		return (0);
+	}
+
+	/* Locate the pixel, then store as many bytes as the format uses. */
+	bpp = dst->format->BytesPerPixel;
+	p = (Uint8 *) dst->pixels + y * dst->pitch + x * bpp;
+
+	if (bpp == 1) {
+		*p = color;
+	} else if (bpp == 2) {
+		*(Uint16 *) p = color;
+	} else if (bpp == 3) {
+		/* Three single-byte stores, ordered according to SDL_BYTEORDER. */
+		const Uint8 low = color & 0xff;
+		const Uint8 middle = (color >> 8) & 0xff;
+		const Uint8 high = (color >> 16) & 0xff;
+
+		if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+			p[0] = high;
+			p[1] = middle;
+			p[2] = low;
+		} else {
+			p[0] = low;
+			p[1] = middle;
+			p[2] = high;
+		}
+	} else if (bpp == 4) {
+		*(Uint32 *) p = color;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal pixel drawing - fast, no blending, no locking, no clipping.
+
+Fastest variant, but dangerous: the coordinates are not checked at all.
+The caller has to guarantee that (x,y) is inside the surface.
+
+\param dst The surface to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param color The color value of the pixel to draw. 
+
+\returns Always returns 0.
+*/
+int fastPixelColorNolockNoclip(SDL_Surface * dst, Sint16 x, Sint16 y, Uint32 color)
+{
+	const int bpp = dst->format->BytesPerPixel;
+	Uint8 *const p = (Uint8 *) dst->pixels + y * dst->pitch + x * bpp;
+
+	/* Store as many bytes as the destination format uses per pixel. */
+	switch (bpp) {
+	case 1:
+		*p = color;
+		break;
+	case 2:
+		*(Uint16 *) p = color;
+		break;
+	case 3:
+		{
+			/* Three single-byte stores, ordered according to SDL_BYTEORDER. */
+			const Uint8 low = color & 0xff;
+			const Uint8 middle = (color >> 8) & 0xff;
+			const Uint8 high = (color >> 16) & 0xff;
+
+			if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+				p[0] = high;
+				p[1] = middle;
+				p[2] = low;
+			} else {
+				p[0] = low;
+				p[1] = middle;
+				p[2] = high;
+			}
+		}
+		break;
+	case 4:
+		*(Uint32 *) p = color;
+		break;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal pixel drawing - fast, no blending, locking, clipping.
+
+Locks the surface if it requires locking, draws through
+fastPixelColorNolock and unlocks again.
+
+\param dst The surface to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param color The color value of the pixel to draw. 
+
+\returns Returns the result of fastPixelColorNolock, or -1 if the surface could not be locked.
+*/
+int fastPixelColor(SDL_Surface * dst, Sint16 x, Sint16 y, Uint32 color)
+{
+	int status;
+
+	/* Take the lock only when the surface needs one. */
+	if (SDL_MUSTLOCK(dst) && (SDL_LockSurface(dst) < 0)) {
+		return (-1);
+	}
+
+	status = fastPixelColorNolock(dst, x, y, color);
+
+	/* Give the lock back. */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return status;
+}
+
+/*!
+\brief Internal pixel drawing - fast, no blending, locking, RGB input.
+
+Maps the r/g/b/a components to the surface's pixel format with SDL_MapRGBA
+and draws the result through fastPixelColor.
+
+\param dst The surface to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param r The red value of the pixel to draw. 
+\param g The green value of the pixel to draw. 
+\param b The blue value of the pixel to draw. 
+\param a The alpha value of the pixel to draw. 
+
+\returns Returns the result of fastPixelColor.
+*/
+int fastPixelRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Convert to the destination format first, then draw with locking. */
+	const Uint32 mapped = SDL_MapRGBA(dst->format, r, g, b, a);
+
+	return fastPixelColor(dst, x, y, mapped);
+}
+
+/*!
+\brief Internal pixel drawing - fast, no blending, no locking RGB input.
+
+Maps the r/g/b/a components to the surface's pixel format with SDL_MapRGBA
+and draws the result through fastPixelColorNolock.
+
+\param dst The surface to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param r The red value of the pixel to draw. 
+\param g The green value of the pixel to draw. 
+\param b The blue value of the pixel to draw. 
+\param a The alpha value of the pixel to draw. 
+
+\returns Returns the result of fastPixelColorNolock.
+*/
+int fastPixelRGBANolock(SDL_Surface * dst, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Convert to the destination format first, then draw without locking. */
+	const Uint32 mapped = SDL_MapRGBA(dst->format, r, g, b, a);
+
+	return fastPixelColorNolock(dst, x, y, mapped);
+}
+
+/*!
+\brief Internal pixel drawing function with alpha blending where input color is in destination format.
+
+Contains two alternative 32 bit alpha blending routines which can be enabled at the source
+level with the defines DEFAULT_ALPHA_PIXEL_ROUTINE or EXPERIMENTAL_ALPHA_PIXEL_ROUTINE.
+Only the bits up to the surface depth are significant in the color value.
+
+The pixel is written only when (x,y) lies inside the surface's clip rectangle;
+otherwise the call does nothing. An alpha of 255 stores the color unchanged,
+any other value blends it with the pixel already present. No locking is done
+here.
+
+\param dst The surface to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param color The color value of the pixel to draw. 
+\param alpha The blend factor to apply while drawing.
+
+\returns Returns 0 (also when the pixel was clipped away), -1 if dst is NULL.
+*/
+int _putPixelAlpha(SDL_Surface *dst, Sint16 x, Sint16 y, Uint32 color, Uint8 alpha)
+{
+	SDL_PixelFormat *format;
+	Uint32 Rmask, Gmask, Bmask, Amask;
+	Uint32 Rshift, Gshift, Bshift, Ashift;
+	Uint32 sR, sG, sB;
+	Uint32 dR, dG, dB, dA;
+
+	if (dst == NULL)
+	{
+		return (-1);
+	}
+
+	/* Honor the clip rectangle at pixel level */
+	if (x >= clip_xmin(dst) && x <= clip_xmax(dst) && 
+		y >= clip_ymin(dst) && y <= clip_ymax(dst)) 
+	{
+
+		format = dst->format;
+
+		/*
+		* One branch per pixel size. The blends below use the form
+		* d + ((s - d) * alpha >> 8) on Uint32 operands, so (s - d) wraps
+		* around when s < d; the formula relies on the result being narrowed
+		* or masked afterwards.
+		*/
+		switch (format->BytesPerPixel) {
+		case 1:
+			{		/* Assuming 8-bpp */
+				Uint8 *pixel = (Uint8 *) dst->pixels + y * dst->pitch + x;
+				if (alpha == 255) {
+					*pixel = color;
+				} else {
+					/* Both the stored pixel and 'color' are used as palette indices */
+					Uint8 R, G, B;
+					SDL_Palette *palette = format->palette;
+					SDL_Color *colors = palette->colors;
+					SDL_Color dColor = colors[*pixel];
+					SDL_Color sColor = colors[color];
+					dR = dColor.r;
+					dG = dColor.g;
+					dB = dColor.b;
+					sR = sColor.r;
+					sG = sColor.g;
+					sB = sColor.b;
+
+					R = dR + ((sR - dR) * alpha >> 8);
+					G = dG + ((sG - dG) * alpha >> 8);
+					B = dB + ((sB - dB) * alpha >> 8);
+
+					/* Map the blended RGB back to a pixel value for this format */
+					*pixel = SDL_MapRGB(format, R, G, B);
+				}
+			}
+			break;
+
+		case 2:
+			{		/* Probably 15-bpp or 16-bpp */
+				Uint16 *pixel = (Uint16 *) dst->pixels + y * dst->pitch / 2 + x;
+				if (alpha == 255) {
+					*pixel = color;
+				} else {
+					Uint16 R, G, B, A;
+					Uint16 dc = *pixel;
+
+					Rmask = format->Rmask;
+					Gmask = format->Gmask;
+					Bmask = format->Bmask;
+					Amask = format->Amask;
+
+					/* Channels stay at their bit positions; each is blended in place and re-masked */
+					dR = (dc & Rmask);
+					dG = (dc & Gmask);
+					dB = (dc & Bmask);
+
+					R = (dR + (((color & Rmask) - dR) * alpha >> 8)) & Rmask;
+					G = (dG + (((color & Gmask) - dG) * alpha >> 8)) & Gmask;
+					B = (dB + (((color & Bmask) - dB) * alpha >> 8)) & Bmask;
+					*pixel = R | G | B;
+					/* The alpha channel is blended the same way when the format has one */
+					if (Amask!=0) {
+						dA = (dc & Amask);
+						A = (dA + (((color & Amask) - dA) * alpha >> 8)) & Amask;
+						*pixel |= A;
+					}
+				}
+			}
+			break;
+
+		case 3: 
+			{		/* Slow 24-bpp mode, usually not used */
+				Uint8 R, G, B;
+				Uint8 Rshift8, Gshift8, Bshift8;
+				Uint8 *pixel = (Uint8 *) dst->pixels + y * dst->pitch + x * 3;
+
+				Rshift = format->Rshift;
+				Gshift = format->Gshift;
+				Bshift = format->Bshift;
+
+				/* Bit shift divided by 8: used below as the byte index of each channel */
+				Rshift8 = Rshift >> 3;
+				Gshift8 = Gshift >> 3;
+				Bshift8 = Bshift >> 3;
+
+				sR = (color >> Rshift) & 0xFF;
+				sG = (color >> Gshift) & 0xFF;
+				sB = (color >> Bshift) & 0xFF;
+
+				if (alpha == 255) {
+					*(pixel + Rshift8) = sR;
+					*(pixel + Gshift8) = sG;
+					*(pixel + Bshift8) = sB;
+				} else {
+					dR = *((pixel) + Rshift8);
+					dG = *((pixel) + Gshift8);
+					dB = *((pixel) + Bshift8);
+
+					R = dR + ((sR - dR) * alpha >> 8);
+					G = dG + ((sG - dG) * alpha >> 8);
+					B = dB + ((sB - dB) * alpha >> 8);
+
+					*((pixel) + Rshift8) = R;
+					*((pixel) + Gshift8) = G;
+					*((pixel) + Bshift8) = B;
+				}
+			}
+			break;
+
+#ifdef DEFAULT_ALPHA_PIXEL_ROUTINE
+
+		case 4:
+			{		/* Probably :-) 32-bpp */
+				Uint32 R, G, B, A;
+				Uint32 *pixel = (Uint32 *) dst->pixels + y * dst->pitch / 4 + x;
+				if (alpha == 255) {
+					*pixel = color;
+				} else {
+					Uint32 dc = *pixel;
+
+					Rmask = format->Rmask;
+					Gmask = format->Gmask;
+					Bmask = format->Bmask;
+					Amask = format->Amask;
+
+					Rshift = format->Rshift;
+					Gshift = format->Gshift;
+					Bshift = format->Bshift;
+					Ashift = format->Ashift;
+
+					/* Channels are shifted down before blending, then shifted back and masked */
+					dR = (dc & Rmask) >> Rshift;
+					dG = (dc & Gmask) >> Gshift;
+					dB = (dc & Bmask) >> Bshift;
+
+
+					R = ((dR + ((((color & Rmask) >> Rshift) - dR) * alpha >> 8)) << Rshift) & Rmask;
+					G = ((dG + ((((color & Gmask) >> Gshift) - dG) * alpha >> 8)) << Gshift) & Gmask;
+					B = ((dB + ((((color & Bmask) >> Bshift) - dB) * alpha >> 8)) << Bshift) & Bmask;
+					*pixel = R | G | B;
+					if (Amask!=0) {
+						dA = (dc & Amask) >> Ashift;
+
+						/*
+						* Additive variant: OR the table value for 'alpha' into the
+						* existing destination alpha. Other variant: interpolate the
+						* alpha channel like the color channels.
+						*/
+#ifdef ALPHA_PIXEL_ADDITIVE_BLEND
+						A = (dA | GFX_ALPHA_ADJUST_ARRAY[alpha & 255]) << Ashift; // make destination less transparent...
+#else
+						A = ((dA + ((((color & Amask) >> Ashift) - dA) * alpha >> 8)) << Ashift) & Amask;
+#endif
+						*pixel |= A;
+					}
+				}
+			}
+			break;
+#endif
+
+		/*
+		* NOTE(review): this alternative is compiled only when
+		* EXPERIMENTAL_ALPHA_PIXEL_ROUTINE is defined. It uses R, G, B and A,
+		* which are not declared in this case's scope, and 'if (A = ...)' is
+		* an assignment, not a comparison - check both before enabling it.
+		*/
+#ifdef EXPERIMENTAL_ALPHA_PIXEL_ROUTINE
+
+		case 4:{		/* Probably :-) 32-bpp */
+			if (alpha == 255) {
+				*((Uint32 *) dst->pixels + y * dst->pitch / 4 + x) = color;
+			} else {
+				Uint32 *pixel = (Uint32 *) dst->pixels + y * dst->pitch / 4 + x;
+				Uint32 dR, dG, dB, dA;
+				Uint32 dc = *pixel;
+
+				Uint32 surfaceAlpha, preMultR, preMultG, preMultB;
+				Uint32 aTmp;
+
+				Rmask = format->Rmask;
+				Gmask = format->Gmask;
+				Bmask = format->Bmask;
+				Amask = format->Amask;
+
+				/* Despite the 'd' prefix these hold the channels of the incoming color */
+				dR = (color & Rmask);
+				dG = (color & Gmask);
+				dB = (color & Bmask);
+				dA = (color & Amask);
+
+				Rshift = format->Rshift;
+				Gshift = format->Gshift;
+				Bshift = format->Bshift;
+				Ashift = format->Ashift;
+
+				preMultR = (alpha * (dR >> Rshift));
+				preMultG = (alpha * (dG >> Gshift));
+				preMultB = (alpha * (dB >> Bshift));
+
+				surfaceAlpha = ((dc & Amask) >> Ashift);
+				aTmp = (255 - alpha);
+				if (A = 255 - ((aTmp * (255 - surfaceAlpha)) >> 8 )) {
+					aTmp *= surfaceAlpha;
+					R = (preMultR + ((aTmp * ((dc & Rmask) >> Rshift)) >> 8)) / A << Rshift & Rmask;
+					G = (preMultG + ((aTmp * ((dc & Gmask) >> Gshift)) >> 8)) / A << Gshift & Gmask;
+					B = (preMultB + ((aTmp * ((dc & Bmask) >> Bshift)) >> 8)) / A << Bshift & Bmask;
+				}
+				*pixel = R | G | B | (A << Ashift & Amask);
+
+			}
+			   }
+			   break;
+#endif
+		}
+	}
+
+	return (0);
+}
+
+/*!
+\brief Pixel draw with blending enabled if a<255.
+
+Locks the surface if it requires locking, converts the 0xRRGGBBAA color to
+the surface's pixel format and draws it through _putPixelAlpha, using the
+low byte of 'color' as blend factor.
+
+\param dst The surface to draw on.
+\param x X (horizontal) coordinate of the pixel.
+\param y Y (vertical) coordinate of the pixel.
+\param color The color value of the pixel to draw (0xRRGGBBAA). 
+
+\returns Returns the result of _putPixelAlpha, or -1 if the surface could not be locked.
+*/
+int pixelColor(SDL_Surface * dst, Sint16 x, Sint16 y, Uint32 color)
+{
+	/* Split 0xRRGGBBAA into its four components. */
+	const Uint8 red = (color & 0xff000000) >> 24;
+	const Uint8 green = (color & 0x00ff0000) >> 16;
+	const Uint8 blue = (color & 0x0000ff00) >> 8;
+	const Uint8 alpha = color & 0x000000ff;
+	Uint32 mapped;
+	int status = 0;
+
+	/* Take the lock only when the surface needs one. */
+	if (SDL_MUSTLOCK(dst) && (SDL_LockSurface(dst) < 0)) {
+		return (-1);
+	}
+
+	/* Convert to the destination format and draw. */
+	mapped = SDL_MapRGBA(dst->format, red, green, blue, alpha);
+	status = _putPixelAlpha(dst, x, y, mapped, alpha);
+
+	/* Give the lock back. */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return status;
+}
+
+/*!
+\brief Pixel draw with blending enabled if a<255 - no surface locking.
+
+Converts the 0xRRGGBBAA color to the surface's pixel format and draws it
+through _putPixelAlpha, using the low byte of 'color' as blend factor.
+The surface is neither locked nor unlocked here.
+
+\param dst The surface to draw on.
+\param x X (horizontal) coordinate of the pixel.
+\param y Y (vertical) coordinate of the pixel.
+\param color The color value of the pixel to draw (0xRRGGBBAA). 
+
+\returns Returns the result of _putPixelAlpha.
+*/
+int pixelColorNolock(SDL_Surface * dst, Sint16 x, Sint16 y, Uint32 color)
+{
+	/* Split 0xRRGGBBAA into its four components. */
+	const Uint8 red = (color & 0xff000000) >> 24;
+	const Uint8 green = (color & 0x00ff0000) >> 16;
+	const Uint8 blue = (color & 0x0000ff00) >> 8;
+	const Uint8 alpha = color & 0x000000ff;
+
+	/* Convert to the destination format and draw. */
+	const Uint32 mapped = SDL_MapRGBA(dst->format, red, green, blue, alpha);
+
+	return _putPixelAlpha(dst, x, y, mapped, alpha);
+}
+
+
+/*!
+\brief Internal function to draw filled rectangle with alpha blending.
+
+Assumes color is in destination format.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner (upper left) of the rectangle.
+\param y1 Y coordinate of the first corner (upper left) of the rectangle.
+\param x2 X coordinate of the second corner (lower right) of the rectangle.
+\param y2 Y coordinate of the second corner (lower right) of the rectangle.
+\param color The color value of the rectangle to draw (0xRRGGBBAA). 
+\param alpha Alpha blending amount for pixels.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int _filledRectAlpha(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color, Uint8 alpha)
+{
+	SDL_PixelFormat *format;
+	Uint32 Rmask, Gmask, Bmask, Amask;
+	Uint32 Rshift, Gshift, Bshift, Ashift;
+	Uint32 sR, sG, sB, sA;
+	Uint32 dR, dG, dB, dA;
+	Sint16 x, y;
+
+	if (dst == NULL) {
+		return (-1);
+	}
+
+	format = dst->format;
+	switch (format->BytesPerPixel) {
+	case 1:
+		{			/* Assuming 8-bpp */
+			Uint8 *row, *pixel;
+			Uint8 R, G, B;
+			SDL_Color *colors = format->palette->colors;
+			SDL_Color dColor;
+			SDL_Color sColor = colors[color];
+			sR = sColor.r;
+			sG = sColor.g;
+			sB = sColor.b;
+
+			for (y = y1; y <= y2; y++) {
+				row = (Uint8 *) dst->pixels + y * dst->pitch;
+				for (x = x1; x <= x2; x++) {
+					if (alpha == 255) {
+						*(row + x) = color;
+					} else {
+						pixel = row + x;
+
+						dColor = colors[*pixel];
+						dR = dColor.r;
+						dG = dColor.g;
+						dB = dColor.b;
+
+						R = dR + ((sR - dR) * alpha >> 8);
+						G = dG + ((sG - dG) * alpha >> 8);
+						B = dB + ((sB - dB) * alpha >> 8);
+
+						*pixel = SDL_MapRGB(format, R, G, B);
+					}
+				}
+			}
+		}
+		break;
+
+	case 2:
+		{			/* Probably 15-bpp or 16-bpp */
+			Uint16 *row, *pixel;
+			Uint16 R, G, B, A;
+			Uint16 dc;
+			Rmask = format->Rmask;
+			Gmask = format->Gmask;
+			Bmask = format->Bmask;
+			Amask = format->Amask;
+
+			sR = (color & Rmask); 
+			sG = (color & Gmask);
+			sB = (color & Bmask);
+			sA = (color & Amask);
+
+			for (y = y1; y <= y2; y++) {
+				row = (Uint16 *) dst->pixels + y * dst->pitch / 2;
+				for (x = x1; x <= x2; x++) {
+					if (alpha == 255) {
+						*(row + x) = color;
+					} else {
+						pixel = row + x;
+						dc = *pixel;
+
+						dR = (dc & Rmask);
+						dG = (dc & Gmask);
+						dB = (dc & Bmask);
+
+						R = (dR + ((sR - dR) * alpha >> 8)) & Rmask;
+						G = (dG + ((sG - dG) * alpha >> 8)) & Gmask;
+						B = (dB + ((sB - dB) * alpha >> 8)) & Bmask;
+						*pixel = R | G | B;
+						if (Amask!=0) {
+							dA = (dc & Amask);
+							A = (dA + ((sA - dA) * alpha >> 8)) & Amask;
+							*pixel |= A;
+						} 
+					}
+				}
+			}
+		}
+		break;
+
+	case 3:
+		{			/* Slow 24-bpp mode, usually not used */
+			Uint8 *row, *pixel;
+			Uint8 R, G, B;
+			Uint8 Rshift8, Gshift8, Bshift8;
+
+			Rshift = format->Rshift;
+			Gshift = format->Gshift;
+			Bshift = format->Bshift;
+
+			Rshift8 = Rshift >> 3;
+			Gshift8 = Gshift >> 3;
+			Bshift8 = Bshift >> 3;
+
+			sR = (color >> Rshift) & 0xff;
+			sG = (color >> Gshift) & 0xff;
+			sB = (color >> Bshift) & 0xff;
+
+			for (y = y1; y <= y2; y++) {
+				row = (Uint8 *) dst->pixels + y * dst->pitch;
+				for (x = x1; x <= x2; x++) {
+					pixel = row + x * 3;
+
+					if (alpha == 255) {
+						*(pixel + Rshift8) = sR;
+						*(pixel + Gshift8) = sG;
+						*(pixel + Bshift8) = sB;
+					} else {
+						dR = *((pixel) + Rshift8);
+						dG = *((pixel) + Gshift8);
+						dB = *((pixel) + Bshift8);
+
+						R = dR + ((sR - dR) * alpha >> 8);
+						G = dG + ((sG - dG) * alpha >> 8);
+						B = dB + ((sB - dB) * alpha >> 8);
+
+						*((pixel) + Rshift8) = R;
+						*((pixel) + Gshift8) = G;
+						*((pixel) + Bshift8) = B;
+					}
+				}
+			}
+		}
+		break;
+
+#ifdef DEFAULT_ALPHA_PIXEL_ROUTINE
+	case 4:
+		{			/* Probably :-) 32-bpp */
+			Uint32 *row, *pixel;
+			Uint32 R, G, B, A;
+			Uint32 dc;
+			Rmask = format->Rmask;
+			Gmask = format->Gmask;
+			Bmask = format->Bmask;
+			Amask = format->Amask;
+
+			Rshift = format->Rshift;
+			Gshift = format->Gshift;
+			Bshift = format->Bshift;
+			Ashift = format->Ashift;
+
+			sR = (color & Rmask) >> Rshift;
+			sG = (color & Gmask) >> Gshift;
+			sB = (color & Bmask) >> Bshift;
+			sA = (color & Amask) >> Ashift;
+
+			for (y = y1; y <= y2; y++) {
+				row = (Uint32 *) dst->pixels + y * dst->pitch / 4;
+				for (x = x1; x <= x2; x++) {
+					if (alpha == 255) {
+						*(row + x) = color;
+					} else {
+						pixel = row + x;
+						dc = *pixel;
+
+						dR = (dc & Rmask) >> Rshift;
+						dG = (dc & Gmask) >> Gshift;
+						dB = (dc & Bmask) >> Bshift;
+
+						R = ((dR + ((sR - dR) * alpha >> 8)) << Rshift) & Rmask;
+						G = ((dG + ((sG - dG) * alpha >> 8)) << Gshift) & Gmask;
+						B = ((dB + ((sB - dB) * alpha >> 8)) << Bshift) & Bmask;
+						*pixel = R | G | B;
+						if (Amask!=0) {
+							dA = (dc & Amask) >> Ashift;
+#ifdef ALPHA_PIXEL_ADDITIVE_BLEND
+							A = (dA | GFX_ALPHA_ADJUST_ARRAY[sA & 255]) << Ashift; // make destination less transparent...
+#else
+							A = ((dA + ((sA - dA) * alpha >> 8)) << Ashift) & Amask;
+#endif
+							*pixel |= A;
+						}
+					}
+				}
+			}
+		}
+		break;
+#endif
+
+#ifdef EXPERIMENTAL_ALPHA_PIXEL_ROUTINE
+	case 4:{			/* Probably :-) 32-bpp */
+		Uint32 *row, *pixel;
+		Uint32 dR, dG, dB, dA;
+		Uint32 dc;
+		Uint32 surfaceAlpha, preMultR, preMultG, preMultB;
+		Uint32 aTmp;
+
+		Rmask = format->Rmask;
+		Gmask = format->Gmask;
+		Bmask = format->Bmask;
+		Amask = format->Amask;
+
+		dR = (color & Rmask);
+		dG = (color & Gmask);
+		dB = (color & Bmask);
+		dA = (color & Amask);
+
+		Rshift = format->Rshift;
+		Gshift = format->Gshift;
+		Bshift = format->Bshift;
+		Ashift = format->Ashift;
+
+		preMultR = (alpha * (dR >> Rshift));
+		preMultG = (alpha * (dG >> Gshift));
+		preMultB = (alpha * (dB >> Bshift));
+
+		for (y = y1; y <= y2; y++) {
+			row = (Uint32 *) dst->pixels + y * dst->pitch / 4;
+			for (x = x1; x <= x2; x++) {
+				if (alpha == 255) {
+					*(row + x) = color;
+				} else {
+					pixel = row + x;
+					dc = *pixel;
+
+					surfaceAlpha = ((dc & Amask) >> Ashift);
+					aTmp = (255 - alpha);
+					if (A = 255 - ((aTmp * (255 - surfaceAlpha)) >> 8 )) {
+						aTmp *= surfaceAlpha;
+						R = (preMultR + ((aTmp * ((dc & Rmask) >> Rshift)) >> 8)) / A << Rshift & Rmask;
+						G = (preMultG + ((aTmp * ((dc & Gmask) >> Gshift)) >> 8)) / A << Gshift & Gmask;
+						B = (preMultB + ((aTmp * ((dc & Bmask) >> Bshift)) >> 8)) / A << Bshift & Bmask;
+					}
+					*pixel = R | G | B | (A << Ashift & Amask);
+				}
+			}
+		}
+		   }
+		   break;
+#endif
+
+	}
+
+	return (0);
+}
+
+/*!
+\brief Fill a rectangle with an RGBA color, alpha-blending it onto the surface.
+
+Locks the surface when SDL_MUSTLOCK() asks for it, maps the color to the pixel
+format of the surface and delegates the actual blending to _filledRectAlpha(),
+passing the low byte of the color as blend factor.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner (upper left) of the rectangle.
+\param y1 Y coordinate of the first corner (upper left) of the rectangle.
+\param x2 X coordinate of the second corner (lower right) of the rectangle.
+\param y2 Y coordinate of the second corner (lower right) of the rectangle.
+\param color The color value of the rectangle to draw (0xRRGGBBAA).
+
+\returns Returns the result of _filledRectAlpha(), or -1 if the surface could not be locked.
+*/
+int filledRectAlpha(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	Uint8 red, green, blue, opacity;
+	Uint32 mapped;
+	int status;
+
+	/* Acquire the surface lock first; give up if that is not possible. */
+	if (SDL_MUSTLOCK(dst) && (SDL_LockSurface(dst) < 0)) {
+		return -1;
+	}
+
+	/* Split 0xRRGGBBAA into channels and map them to the surface format. */
+	red = (Uint8) (color >> 24);
+	green = (Uint8) (color >> 16);
+	blue = (Uint8) (color >> 8);
+	opacity = (Uint8) color;
+	mapped = SDL_MapRGBA(dst->format, red, green, blue, opacity);
+
+	/* Blend the rectangle; the alpha byte doubles as the blend factor. */
+	status = _filledRectAlpha(dst, x1, y1, x2, y2, mapped, opacity);
+
+	/* Release the lock taken above. */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return status;
+}
+
+/*!
+\brief Internal helper: alpha-blended horizontal line.
+
+Implemented as a filled rectangle that is exactly one row high.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+\param color The color value of the line to draw (0xRRGGBBAA).
+
+\returns Returns whatever filledRectAlpha() returns (0 on success, -1 on failure).
+*/
+int _HLineAlpha(SDL_Surface * dst, Sint16 x1, Sint16 x2, Sint16 y, Uint32 color)
+{
+	/* Top and bottom edge are the same row. */
+	const int status = filledRectAlpha(dst, x1, y, x2, y, color);
+
+	return status;
+}
+
+/*!
+\brief Internal helper: alpha-blended vertical line.
+
+Implemented as a filled rectangle that is exactly one column wide.
+
+\param dst The surface to draw on.
+\param x X coordinate of the points of the line.
+\param y1 Y coordinate of the first point (top) of the line.
+\param y2 Y coordinate of the second point (bottom) of the line.
+\param color The color value of the line to draw (0xRRGGBBAA).
+
+\returns Returns whatever filledRectAlpha() returns (0 on success, -1 on failure).
+*/
+int _VLineAlpha(SDL_Surface * dst, Sint16 x, Sint16 y1, Sint16 y2, Uint32 color)
+{
+	/* Left and right edge are the same column. */
+	const int status = filledRectAlpha(dst, x, y1, x, y2, color);
+
+	return status;
+}
+
+/*!
+\brief Draw a blended pixel whose alpha is scaled by a weight.
+
+The alpha byte of the color is multiplied by the weight and divided by 256
+before the pixel is handed to pixelColor(); a weight of 256 therefore leaves
+the alpha untouched. The scaled alpha is OR-ed back without masking, so
+weights above 256 can spill into the blue byte.
+
+\param dst The surface to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param color The color value of the pixel to draw (0xRRGGBBAA).
+\param weight The weight multiplied into the alpha value of the pixel.
+
+\returns Returns the result of pixelColor() (0 on success, -1 on failure).
+*/
+int pixelColorWeight(SDL_Surface * dst, Sint16 x, Sint16 y, Uint32 color, Uint32 weight)
+{
+	/* Color channels without the alpha byte. */
+	const Uint32 rgb = color & (Uint32) 0xffffff00;
+	/* Alpha byte scaled by weight/256. */
+	const Uint32 scaledAlpha = ((color & (Uint32) 0x000000ff) * weight) >> 8;
+
+	return pixelColor(dst, x, y, rgb | scaledAlpha);
+}
+
+/*!
+\brief Draw a blended pixel whose alpha is scaled by a weight - no locking.
+
+Same alpha scaling as pixelColorWeight() (alpha * weight / 256, OR-ed back
+without masking), but the pixel is drawn through pixelColorNolock().
+
+\param dst The surface to draw on.
+\param x The horizontal coordinate of the pixel.
+\param y The vertical position of the pixel.
+\param color The color value of the pixel to draw (0xRRGGBBAA).
+\param weight The weight multiplied into the alpha value of the pixel.
+
+\returns Returns the result of pixelColorNolock() (0 on success, -1 on failure).
+*/
+int pixelColorWeightNolock(SDL_Surface * dst, Sint16 x, Sint16 y, Uint32 color, Uint32 weight)
+{
+	/* Color channels without the alpha byte. */
+	const Uint32 rgb = color & (Uint32) 0xffffff00;
+	/* Alpha byte scaled by weight/256. */
+	const Uint32 scaledAlpha = ((color & (Uint32) 0x000000ff) * weight) >> 8;
+
+	return pixelColorNolock(dst, x, y, rgb | scaledAlpha);
+}
+
+/*!
+\brief Draw a pixel from separate channels, blending only if a<255.
+
+A fully opaque pixel (a == 255) is mapped to the surface format and stored
+with fastPixelColor(); any other alpha is packed into 0xRRGGBBAA and passed
+to pixelColor().
+
+\param dst The surface to draw on.
+\param x X (horizontal) coordinate of the pixel.
+\param y Y (vertical) coordinate of the pixel.
+\param r The red color value of the pixel to draw.
+\param g The green color value of the pixel to draw.
+\param b The blue color value of the pixel to draw.
+\param a The alpha value of the pixel to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int pixelRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	if (a != 255) {
+		/* Translucent: pack the channels and let pixelColor() deal with it. */
+		const Uint32 packed =
+			((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+		return pixelColor(dst, x, y, packed);
+	}
+
+	/* Opaque: map to the surface format and store the value directly. */
+	return fastPixelColor(dst, x, y, SDL_MapRGBA(dst->format, r, g, b, a));
+}
+
+
+/*!
+\brief Draw horizontal line without blending.
+
+The color value is written to the pixels as given (alpha component included,
+no blending and no format mapping). Only as many bytes as one destination
+pixel holds are taken from the input color value.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of one end of the line.
+\param x2 X coordinate of the other end of the line.
+\param y Y coordinate of the points of the line.
+\param color The color value of the line to draw.
+
+\returns Returns 0 on success (including a fully clipped line), -1 if the surface could not be locked.
+*/
+int hlineColorStore(SDL_Surface * dst, Sint16 x1, Sint16 x2, Sint16 y, Uint32 color)
+{
+	Sint16 clipLeft, clipRight, clipTop, clipBottom;
+	Uint8 *cursor, *last;
+	Uint8 byte0, byte1, byte2;
+	int span;
+	int bpp;
+
+	/* An empty clipping rectangle hides everything. */
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/* Order the end points so that x1 <= x2. */
+	if (x1 > x2) {
+		const Sint16 swap = x1;
+		x1 = x2;
+		x2 = swap;
+	}
+
+	/* Reject a line that lies completely outside the clipping rectangle. */
+	clipLeft = dst->clip_rect.x;
+	clipRight = dst->clip_rect.x + dst->clip_rect.w - 1;
+	clipTop = dst->clip_rect.y;
+	clipBottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	if ((x2 < clipLeft) || (x1 > clipRight) || (y < clipTop) || (y > clipBottom)) {
+		return 0;
+	}
+
+	/* Trim both ends to the clipping rectangle. */
+	if (x1 < clipLeft) {
+		x1 = clipLeft;
+	}
+	if (x2 > clipRight) {
+		x2 = clipRight;
+	}
+
+	/* Distance between first and last pixel (pixel count minus one). */
+	span = (Sint16) (x2 - x1);
+
+	if (SDL_MUSTLOCK(dst) && (SDL_LockSurface(dst) < 0)) {
+		return -1;
+	}
+
+	/* Address of the leftmost pixel. */
+	bpp = dst->format->BytesPerPixel;
+	cursor = ((Uint8 *) dst->pixels) + bpp * (int) x1 + dst->pitch * (int) y;
+
+	switch (bpp) {
+	case 1:
+		memset(cursor, color, span + 1);
+		break;
+	case 2:
+		last = cursor + 2 * span;
+		while (cursor <= last) {
+			*(Uint16 *) cursor = color;
+			cursor += bpp;
+		}
+		break;
+	case 3:
+		/* Pick the byte order once; it is the same for every pixel. */
+		if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+			byte0 = (color >> 16) & 0xff;
+			byte1 = (color >> 8) & 0xff;
+			byte2 = color & 0xff;
+		} else {
+			byte0 = color & 0xff;
+			byte1 = (color >> 8) & 0xff;
+			byte2 = (color >> 16) & 0xff;
+		}
+		last = cursor + 3 * span;
+		while (cursor <= last) {
+			cursor[0] = byte0;
+			cursor[1] = byte1;
+			cursor[2] = byte2;
+			cursor += bpp;
+		}
+		break;
+	default:		/* case 4 */
+		last = cursor + 4 * span;
+		while (cursor <= last) {
+			*(Uint32 *) cursor = color;
+			cursor += bpp;
+		}
+		break;
+	}
+
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return 0;
+}
+
+/*!
+\brief Draw horizontal line without blending, color given as channels.
+
+Packs the channels into 0xRRGGBBAA and stores that value through
+hlineColorStore() (alpha included, no blending).
+Function should only be used for 32 bit target surfaces.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+\param r The red value of the line to draw.
+\param g The green value of the line to draw.
+\param b The blue value of the line to draw.
+\param a The alpha value of the line to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int hlineRGBAStore(SDL_Surface * dst, Sint16 x1, Sint16 x2, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	const Uint32 packed =
+		((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return hlineColorStore(dst, x1, x2, y, packed);
+}
+
+/*!
+\brief Draw horizontal line with blending.
+
+The line is clipped against the clipping rectangle of the surface. A color
+with alpha 255 is mapped to the surface format and written directly; any
+other alpha is drawn through _HLineAlpha().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of one end of the line.
+\param x2 X coordinate of the other end of the line.
+\param y Y coordinate of the points of the line.
+\param color The color value of the line to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success (including a fully clipped line), -1 on failure.
+*/
+int hlineColor(SDL_Surface * dst, Sint16 x1, Sint16 x2, Sint16 y, Uint32 color)
+{
+	Sint16 clipLeft, clipRight, clipTop, clipBottom;
+	Uint8 *cursor, *last;
+	Uint8 bytes[3];
+	Uint32 mapped;
+	int span;
+	int bpp;
+
+	/* An empty clipping rectangle hides everything. */
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/* Order the end points so that x1 <= x2. */
+	if (x1 > x2) {
+		const Sint16 swap = x1;
+		x1 = x2;
+		x2 = swap;
+	}
+
+	/* Reject a line that lies completely outside the clipping rectangle. */
+	clipLeft = dst->clip_rect.x;
+	clipRight = dst->clip_rect.x + dst->clip_rect.w - 1;
+	clipTop = dst->clip_rect.y;
+	clipBottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	if ((x2 < clipLeft) || (x1 > clipRight) || (y < clipTop) || (y > clipBottom)) {
+		return 0;
+	}
+
+	/* Trim both ends to the clipping rectangle. */
+	if (x1 < clipLeft) {
+		x1 = clipLeft;
+	}
+	if (x2 > clipRight) {
+		x2 = clipRight;
+	}
+
+	/* Distance between first and last pixel (pixel count minus one). */
+	span = x2 - x1;
+
+	/* Anything that is not fully opaque goes through the blending path. */
+	if ((color & 255) != 255) {
+		return _HLineAlpha(dst, x1, x1 + span, y, color);
+	}
+
+	/* Opaque: unpack 0xRRGGBBAA and map it to the surface format. */
+	mapped = SDL_MapRGBA(dst->format,
+		(color >> 24) & 0xff, (color >> 16) & 0xff, (color >> 8) & 0xff, color & 0xff);
+
+	if (SDL_MUSTLOCK(dst) && (SDL_LockSurface(dst) < 0)) {
+		return -1;
+	}
+
+	/* Address of the leftmost pixel. */
+	bpp = dst->format->BytesPerPixel;
+	cursor = ((Uint8 *) dst->pixels) + bpp * (int) x1 + dst->pitch * (int) y;
+
+	switch (bpp) {
+	case 1:
+		memset(cursor, mapped, span + 1);
+		break;
+	case 2:
+		last = cursor + 2 * span;
+		while (cursor <= last) {
+			*(Uint16 *) cursor = mapped;
+			cursor += bpp;
+		}
+		break;
+	case 3:
+		/* Lay the three pixel bytes out in memory order once. */
+		if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+			bytes[0] = (mapped >> 16) & 0xff;
+			bytes[1] = (mapped >> 8) & 0xff;
+			bytes[2] = mapped & 0xff;
+		} else {
+			bytes[0] = mapped & 0xff;
+			bytes[1] = (mapped >> 8) & 0xff;
+			bytes[2] = (mapped >> 16) & 0xff;
+		}
+		last = cursor + 3 * span;
+		while (cursor <= last) {
+			memcpy(cursor, bytes, 3);
+			cursor += bpp;
+		}
+		break;
+	default:		/* case 4 */
+		last = cursor + 4 * span;
+		while (cursor <= last) {
+			*(Uint32 *) cursor = mapped;
+			cursor += bpp;
+		}
+		break;
+	}
+
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return 0;
+}
+
+/*!
+\brief Draw horizontal line with blending, color given as channels.
+
+Packs the channels into 0xRRGGBBAA and forwards to hlineColor().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+\param r The red value of the line to draw.
+\param g The green value of the line to draw.
+\param b The blue value of the line to draw.
+\param a The alpha value of the line to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int hlineRGBA(SDL_Surface * dst, Sint16 x1, Sint16 x2, Sint16 y, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	const Uint32 packed =
+		((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return hlineColor(dst, x1, x2, y, packed);
+}
+
+/*!
+\brief Draw vertical line with blending.
+
+The line is clipped against the clipping rectangle of the surface. A color
+with alpha 255 is mapped to the surface format and written directly; any
+other alpha is drawn through _VLineAlpha().
+
+\param dst The surface to draw on.
+\param x X coordinate of the points of the line.
+\param y1 Y coordinate of one end of the line.
+\param y2 Y coordinate of the other end of the line.
+\param color The color value of the line to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success (including a fully clipped line), -1 on failure.
+*/
+int vlineColor(SDL_Surface * dst, Sint16 x, Sint16 y1, Sint16 y2, Uint32 color)
+{
+	Sint16 clipLeft, clipRight, clipTop, clipBottom;
+	Sint16 span;
+	Uint8 *cursor, *last;
+	Uint8 byte0, byte1, byte2;
+	Uint32 mapped;
+	int bpp, pitch;
+
+	/* An empty clipping rectangle hides everything. */
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/* Order the end points so that y1 <= y2. */
+	if (y1 > y2) {
+		const Sint16 swap = y1;
+		y1 = y2;
+		y2 = swap;
+	}
+
+	/* Reject a line that lies completely outside the clipping rectangle. */
+	clipLeft = dst->clip_rect.x;
+	clipRight = dst->clip_rect.x + dst->clip_rect.w - 1;
+	clipTop = dst->clip_rect.y;
+	clipBottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	if ((x < clipLeft) || (x > clipRight) || (y2 < clipTop) || (y1 > clipBottom)) {
+		return 0;
+	}
+
+	/* Trim both ends (y direction) to the clipping rectangle. */
+	if (y1 < clipTop) {
+		y1 = clipTop;
+	}
+	if (y2 > clipBottom) {
+		y2 = clipBottom;
+	}
+
+	/* Distance between first and last pixel (pixel count minus one). */
+	span = y2 - y1;
+
+	/* Anything that is not fully opaque goes through the blending path. */
+	if ((color & 255) != 255) {
+		return _VLineAlpha(dst, x, y1, y1 + span, color);
+	}
+
+	/* Opaque: unpack 0xRRGGBBAA and map it to the surface format. */
+	mapped = SDL_MapRGBA(dst->format,
+		(color >> 24) & 0xff, (color >> 16) & 0xff, (color >> 8) & 0xff, color & 0xff);
+
+	if (SDL_MUSTLOCK(dst) && (SDL_LockSurface(dst) < 0)) {
+		return -1;
+	}
+
+	/* Addresses of the topmost and bottommost pixel; rows are pitch bytes apart. */
+	bpp = dst->format->BytesPerPixel;
+	pitch = dst->pitch;
+	cursor = ((Uint8 *) dst->pixels) + bpp * (int) x + pitch * (int) y1;
+	last = cursor + pitch * (int) span;
+
+	switch (bpp) {
+	case 1:
+		while (cursor <= last) {
+			*(Uint8 *) cursor = mapped;
+			cursor += pitch;
+		}
+		break;
+	case 2:
+		while (cursor <= last) {
+			*(Uint16 *) cursor = mapped;
+			cursor += pitch;
+		}
+		break;
+	case 3:
+		/* Pick the byte order once; it is the same for every pixel. */
+		if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+			byte0 = (mapped >> 16) & 0xff;
+			byte1 = (mapped >> 8) & 0xff;
+			byte2 = mapped & 0xff;
+		} else {
+			byte0 = mapped & 0xff;
+			byte1 = (mapped >> 8) & 0xff;
+			byte2 = (mapped >> 16) & 0xff;
+		}
+		while (cursor <= last) {
+			cursor[0] = byte0;
+			cursor[1] = byte1;
+			cursor[2] = byte2;
+			cursor += pitch;
+		}
+		break;
+	default:		/* case 4 */
+		while (cursor <= last) {
+			*(Uint32 *) cursor = mapped;
+			cursor += pitch;
+		}
+		break;
+	}
+
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return 0;
+}
+
+/*!
+\brief Draw vertical line with blending, color given as channels.
+
+Packs the channels into 0xRRGGBBAA and forwards to vlineColor().
+
+\param dst The surface to draw on.
+\param x X coordinate of the points of the line.
+\param y1 Y coordinate of the first point (i.e. top) of the line.
+\param y2 Y coordinate of the second point (i.e. bottom) of the line.
+\param r The red value of the line to draw.
+\param g The green value of the line to draw.
+\param b The blue value of the line to draw.
+\param a The alpha value of the line to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int vlineRGBA(SDL_Surface * dst, Sint16 x, Sint16 y1, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	const Uint32 packed =
+		((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return vlineColor(dst, x, y1, y2, packed);
+}
+
+/*!
+\brief Draw rectangle outline with blending.
+
+The two points may be any pair of opposite corners; the coordinates are
+ordered internally. Degenerate rectangles are drawn as a single pixel or a
+single line.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner of the rectangle.
+\param y1 Y coordinate of the first corner of the rectangle.
+\param x2 X coordinate of the opposite corner of the rectangle.
+\param y2 Y coordinate of the opposite corner of the rectangle.
+\param color The color value of the rectangle to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int rectangleColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	int status;
+	Sint16 swap;
+
+	/* A missing surface is an error. */
+	if (dst == NULL) {
+		return -1;
+	}
+
+	/* An empty clipping rectangle hides everything. */
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/* Degenerate shapes: a single point or a straight line. */
+	if ((x1 == x2) && (y1 == y2)) {
+		return pixelColor(dst, x1, y1, color);
+	}
+	if (x1 == x2) {
+		return vlineColor(dst, x1, y1, y2, color);
+	}
+	if (y1 == y2) {
+		return hlineColor(dst, x1, x2, y1, color);
+	}
+
+	/* Order the corners so that (x1,y1) is upper left and (x2,y2) lower right. */
+	if (x1 > x2) {
+		swap = x1;
+		x1 = x2;
+		x2 = swap;
+	}
+	if (y1 > y2) {
+		swap = y1;
+		y1 = y2;
+		y2 = swap;
+	}
+
+	/* Top and bottom edge span the full width. */
+	status = hlineColor(dst, x1, x2, y1, color);
+	status |= hlineColor(dst, x1, x2, y2, color);
+
+	/* Side edges skip the corner pixels already drawn above. */
+	y1++;
+	y2--;
+	if (y1 <= y2) {
+		status |= vlineColor(dst, x1, y1, y2, color);
+		status |= vlineColor(dst, x2, y1, y2, color);
+	}
+
+	return status;
+}
+
+/*!
+\brief Draw rectangle outline with blending, color given as channels.
+
+Packs the channels into 0xRRGGBBAA and forwards to rectangleColor().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner of the rectangle.
+\param y1 Y coordinate of the first corner of the rectangle.
+\param x2 X coordinate of the opposite corner of the rectangle.
+\param y2 Y coordinate of the opposite corner of the rectangle.
+\param r The red value of the rectangle to draw.
+\param g The green value of the rectangle to draw.
+\param b The blue value of the rectangle to draw.
+\param a The alpha value of the rectangle to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int rectangleRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	const Uint32 packed =
+		((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return rectangleColor(dst, x1, y1, x2, y2, packed);
+}
+
+/*!
+\brief Draw rounded-corner rectangle outline with blending.
+
+The two points may be any pair of opposite corners; the coordinates are
+ordered internally. The radius is reduced where needed so that two arcs fit
+into the width and the height of the rectangle.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner of the rectangle.
+\param y1 Y coordinate of the first corner of the rectangle.
+\param x2 X coordinate of the opposite corner of the rectangle.
+\param y2 Y coordinate of the opposite corner of the rectangle.
+\param rad The radius of the corner arc; must not be negative.
+\param color The color value of the rectangle to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int roundedRectangleColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color)
+{
+	int status;
+	Sint16 width, height, swap;
+	Sint16 arcLeft, arcRight, arcTop, arcBottom;
+
+	/* A missing surface or a negative radius is an error. */
+	if ((dst == NULL) || (rad < 0)) {
+		return -1;
+	}
+
+	/* Without rounding this is a plain rectangle. */
+	if (rad == 0) {
+		return rectangleColor(dst, x1, y1, x2, y2, color);
+	}
+
+	/* An empty clipping rectangle hides everything. */
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/* Degenerate shapes: a single point or a straight line. */
+	if ((x1 == x2) && (y1 == y2)) {
+		return pixelColor(dst, x1, y1, color);
+	}
+	if (x1 == x2) {
+		return vlineColor(dst, x1, y1, y2, color);
+	}
+	if (y1 == y2) {
+		return hlineColor(dst, x1, x2, y1, color);
+	}
+
+	/* Order the corners so that (x1,y1) is upper left and (x2,y2) lower right. */
+	if (x1 > x2) {
+		swap = x1;
+		x1 = x2;
+		x2 = swap;
+	}
+	if (y1 > y2) {
+		swap = y1;
+		y1 = y2;
+		y2 = swap;
+	}
+
+	width = x2 - x1;
+	height = y2 - y1;
+
+	/* Shrink the radius until two arcs fit side by side in both directions. */
+	if ((rad * 2) > width) {
+		rad = width / 2;
+	}
+	if ((rad * 2) > height) {
+		rad = height / 2;
+	}
+
+	/* Centers of the four corner arcs. */
+	arcLeft = x1 + rad;
+	arcRight = x2 - rad;
+	arcTop = y1 + rad;
+	arcBottom = y2 - rad;
+
+	/* Corners: upper left, upper right, lower left, lower right. */
+	status = arcColor(dst, arcLeft, arcTop, rad, 180, 270, color);
+	status |= arcColor(dst, arcRight, arcTop, rad, 270, 360, color);
+	status |= arcColor(dst, arcLeft, arcBottom, rad, 90, 180, color);
+	status |= arcColor(dst, arcRight, arcBottom, rad, 0, 90, color);
+
+	/* Straight edges between the arcs. */
+	if (arcLeft <= arcRight) {
+		status |= hlineColor(dst, arcLeft, arcRight, y1, color);
+		status |= hlineColor(dst, arcLeft, arcRight, y2, color);
+	}
+	if (arcTop <= arcBottom) {
+		status |= vlineColor(dst, x1, arcTop, arcBottom, color);
+		status |= vlineColor(dst, x2, arcTop, arcBottom, color);
+	}
+
+	return status;
+}
+
+/*!
+\brief Draw rounded-corner rectangle outline with blending, color given as channels.
+
+Packs the channels into 0xRRGGBBAA and forwards to roundedRectangleColor().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner of the rectangle.
+\param y1 Y coordinate of the first corner of the rectangle.
+\param x2 X coordinate of the opposite corner of the rectangle.
+\param y2 Y coordinate of the opposite corner of the rectangle.
+\param rad The radius of the corner arc.
+\param r The red value of the rectangle to draw.
+\param g The green value of the rectangle to draw.
+\param b The blue value of the rectangle to draw.
+\param a The alpha value of the rectangle to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int roundedRectangleRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	const Uint32 packed =
+		((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return roundedRectangleColor(dst, x1, y1, x2, y2, rad, packed);
+}
+
+/*!
+\brief Draw rounded-corner box (filled rectangle) with blending.
+
+The two points may be any pair of opposite corners; the coordinates are
+ordered internally. The radius is reduced where needed so that two corner
+pies fit into the width and the height of the box. A radius of 0 draws a
+plain filled box.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner of the box.
+\param y1 Y coordinate of the first corner of the box.
+\param x2 X coordinate of the opposite corner of the box.
+\param y2 Y coordinate of the opposite corner of the box.
+\param rad The radius of the corner arcs of the box; must not be negative.
+\param color The color value of the box to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int roundedBoxColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 rad, Uint32 color)
+{
+	int result;
+	Sint16 w, h, tmp;
+	Sint16 xx1, xx2, yy1, yy2;
+
+	/*
+	* Check destination surface
+	*/
+	if (dst == NULL)
+	{
+		return -1;
+	}
+
+	/*
+	* Check radius for valid range
+	*/
+	if (rad < 0) {
+		return -1;
+	}
+
+	/*
+	* Special case - no rounding.
+	* This must be a FILLED box: rectangleColor() would only draw the
+	* outline and leave the interior untouched.
+	*/
+	if (rad == 0) {
+		return boxColor(dst, x1, y1, x2, y2, color);
+	}
+
+	/*
+	* Check visibility of clipping rectangle
+	*/
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return 0;
+	}
+
+	/*
+	* Test for special cases of straight lines or single point
+	*/
+	if (x1 == x2) {
+		if (y1 == y2) {
+			return (pixelColor(dst, x1, y1, color));
+		} else {
+			return (vlineColor(dst, x1, y1, y2, color));
+		}
+	} else {
+		if (y1 == y2) {
+			return (hlineColor(dst, x1, x2, y1, color));
+		}
+	}
+
+	/*
+	* Swap x1, x2 if required
+	*/
+	if (x1 > x2) {
+		tmp = x1;
+		x1 = x2;
+		x2 = tmp;
+	}
+
+	/*
+	* Swap y1, y2 if required
+	*/
+	if (y1 > y2) {
+		tmp = y1;
+		y1 = y2;
+		y2 = tmp;
+	}
+
+	/*
+	* Calculate width&height
+	*/
+	w = x2 - x1;
+	h = y2 - y1;
+
+	/*
+	* Maybe adjust radius so that two corners fit next to each other
+	*/
+	if ((rad * 2) > w)
+	{
+		rad = w / 2;
+	}
+	if ((rad * 2) > h)
+	{
+		rad = h / 2;
+	}
+
+	/*
+	* Draw corners (upper left, upper right, lower left, lower right);
+	* xx1/xx2/yy1/yy2 are the centers of the corner pies
+	*/
+	result = 0;
+	xx1 = x1 + rad;
+	xx2 = x2 - rad;
+	yy1 = y1 + rad;
+	yy2 = y2 - rad;
+	result |= filledPieColor(dst, xx1, yy1, rad, 180, 270, color);
+	result |= filledPieColor(dst, xx2, yy1, rad, 270, 360, color);
+	result |= filledPieColor(dst, xx1, yy2, rad,  90, 180, color);
+	result |= filledPieColor(dst, xx2, yy2, rad,   0,  90, color);
+
+	/*
+	* Draw body: a full-height middle column between the pies, plus a
+	* left and a right strip between the upper and lower pies
+	*/
+	xx1++;
+	xx2--;
+	yy1++;
+	yy2--;
+	if (xx1 <= xx2) {
+		result |= boxColor(dst, xx1, y1, xx2, y2, color);
+	}
+	if (yy1 <= yy2) {
+		result |= boxColor(dst, x1, yy1, xx1-1, yy2, color);
+		result |= boxColor(dst, xx2+1, yy1, x2, yy2, color);
+	}
+
+	return result;
+}
+
+/*!
+\brief Draw rounded-corner box (filled rectangle) with blending, color given as channels.
+
+Packs the channels into 0xRRGGBBAA and forwards to roundedBoxColor().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner of the box.
+\param y1 Y coordinate of the first corner of the box.
+\param x2 X coordinate of the opposite corner of the box.
+\param y2 Y coordinate of the opposite corner of the box.
+\param rad The radius of the corner arcs of the box.
+\param r The red value of the box to draw.
+\param g The green value of the box to draw.
+\param b The blue value of the box to draw.
+\param a The alpha value of the box to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int roundedBoxRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2,
+	Sint16 y2, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	const Uint32 packed =
+		((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return roundedBoxColor(dst, x1, y1, x2, y2, rad, packed);
+}
+
+/* --------- Clipping routines for line */
+
+/* Clipping based heavily on code from                       */
+/* http://www.ncsa.uiuc.edu/Vis/Graphics/src/clipCohSuth.c   */
+
+/* Outcode bits: one bit per side of the clipping rectangle a point lies beyond. */
+#define CLIP_LEFT_EDGE   0x1
+#define CLIP_RIGHT_EDGE  0x2
+#define CLIP_BOTTOM_EDGE 0x4
+#define CLIP_TOP_EDGE    0x8
+/* Arguments are parenthesized so that compound expressions expand correctly. */
+/* True when the outcode has no bit set, i.e. the point is inside. */
+#define CLIP_INSIDE(a)   (!(a))
+/* Non-zero when both outcodes share a bit, i.e. both points are beyond the same side. */
+#define CLIP_REJECT(a,b) ((a)&(b))
+/* True when neither outcode has a bit set, i.e. both points are inside. */
+#define CLIP_ACCEPT(a,b) (!((a)|(b)))
+
+/*!
+\brief Internal clip-encoding routine.
+
+Computes the outcode of a point relative to a rectangle: a combination of the
+CLIP_*_EDGE bits naming the sides the point lies beyond, or 0 when the point
+is inside the rectangle (edges included).
+
+\param x X coordinate of point.
+\param y Y coordinate of point.
+\param left X coordinate of left edge of the rectangle.
+\param top Y coordinate of top edge of the rectangle.
+\param right X coordinate of right edge of the rectangle.
+\param bottom Y coordinate of bottom edge of the rectangle.
+
+\returns The outcode of the point.
+*/
+static int _clipEncode(Sint16 x, Sint16 y, Sint16 left, Sint16 top, Sint16 right, Sint16 bottom)
+{
+	/* At most one horizontal and one vertical bit can apply. */
+	const int horizontal = (x < left) ? CLIP_LEFT_EDGE : ((x > right) ? CLIP_RIGHT_EDGE : 0);
+	const int vertical = (y < top) ? CLIP_TOP_EDGE : ((y > bottom) ? CLIP_BOTTOM_EDGE : 0);
+
+	return horizontal | vertical;
+}
+
+/*!
+\brief Clip line to the clipping rectangle of a surface.
+
+Repeatedly moves an end point that lies outside the clipping rectangle onto
+the edge it is beyond, until both end points are inside or both are beyond
+the same edge. The end points are modified in place and may be exchanged.
+
+\param dst Target surface to draw on.
+\param x1 Pointer to X coordinate of first point of line.
+\param y1 Pointer to Y coordinate of first point of line.
+\param x2 Pointer to X coordinate of second point of line.
+\param y2 Pointer to Y coordinate of second point of line.
+
+\returns Returns 1 if a visible part of the line remains, 0 if the line is rejected.
+*/
+static int _clipLine(SDL_Surface * dst, Sint16 * x1, Sint16 * y1, Sint16 * x2, Sint16 * y2)
+{
+	const Sint16 left = dst->clip_rect.x;
+	const Sint16 right = dst->clip_rect.x + dst->clip_rect.w - 1;
+	const Sint16 top = dst->clip_rect.y;
+	const Sint16 bottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	int outcode1, outcode2;
+	int notVertical;
+	float slope;
+
+	for (;;) {
+		outcode1 = _clipEncode(*x1, *y1, left, top, right, bottom);
+		outcode2 = _clipEncode(*x2, *y2, left, top, right, bottom);
+
+		/* Both points inside: done, something is visible. */
+		if (CLIP_ACCEPT(outcode1, outcode2)) {
+			return 1;
+		}
+		/* Both points beyond the same edge: nothing is visible. */
+		if (CLIP_REJECT(outcode1, outcode2)) {
+			return 0;
+		}
+
+		/* Make sure the first point is the one that is outside. */
+		if (CLIP_INSIDE(outcode1)) {
+			const Sint16 oldX = *x1;
+			const Sint16 oldY = *y1;
+			const int oldCode = outcode1;
+
+			*x1 = *x2;
+			*y1 = *y2;
+			outcode1 = outcode2;
+			*x2 = oldX;
+			*y2 = oldY;
+			outcode2 = oldCode;
+		}
+
+		/* Slope of the line; a vertical line gets a placeholder value. */
+		notVertical = (*x2 != *x1);
+		slope = notVertical ? ((float)(*y2 - *y1) / (float)(*x2 - *x1)) : 1.0f;
+
+		/* Move the first point onto the first edge it is beyond. */
+		if (outcode1 & CLIP_LEFT_EDGE) {
+			*y1 += (Sint16) ((left - *x1) * slope);
+			*x1 = left;
+		} else if (outcode1 & CLIP_RIGHT_EDGE) {
+			*y1 += (Sint16) ((right - *x1) * slope);
+			*x1 = right;
+		} else if (outcode1 & CLIP_BOTTOM_EDGE) {
+			if (notVertical) {
+				*x1 += (Sint16) ((bottom - *y1) / slope);
+			}
+			*y1 = bottom;
+		} else if (outcode1 & CLIP_TOP_EDGE) {
+			if (notVertical) {
+				*x1 += (Sint16) ((top - *y1) / slope);
+			}
+			*y1 = top;
+		}
+	}
+}
+
+/*!
+\brief Draw box (filled rectangle) with blending.
+
+The two points are opposite corners of the box and may be given in any
+order; they are sorted and clipped to the surface's clipping rectangle first.
+A box that collapses to a point or a straight line is handed to pixelColor(),
+vlineColor() or hlineColor(). A color with alpha 255 is written directly into
+the pixel buffer, any other alpha is delegated to filledRectAlpha().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner of the box.
+\param y1 Y coordinate of the first corner of the box.
+\param x2 X coordinate of the opposite corner of the box.
+\param y2 Y coordinate of the opposite corner of the box.
+\param color The color value of the box to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int boxColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	Sint16 clipLeft, clipRight, clipTop, clipBottom;
+	Sint16 w, h, hold;
+	Uint8 *row, *lastPixel, *p;
+	int bpp, pitch;
+	int span, col;
+
+	/* Nothing is visible with an empty clipping rectangle */
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return (0);
+	}
+
+	/* Sort the corners so that x1<=x2 and y1<=y2 */
+	if (x1 > x2) {
+		hold = x1;
+		x1 = x2;
+		x2 = hold;
+	}
+	if (y1 > y2) {
+		hold = y1;
+		y1 = y2;
+		y2 = hold;
+	}
+
+	/* Inclusive edges of the clipping rectangle */
+	clipLeft = dst->clip_rect.x;
+	clipRight = dst->clip_rect.x + dst->clip_rect.w - 1;
+	clipTop = dst->clip_rect.y;
+	clipBottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+
+	/* Box completely outside of the clipping rectangle */
+	if ((x2 < clipLeft) || (x1 > clipRight) || (y2 < clipTop) || (y1 > clipBottom)) {
+		return (0);
+	}
+
+	/* Clamp all four coordinates into the clipping rectangle */
+	x1 = (x1 < clipLeft) ? clipLeft : ((x1 > clipRight) ? clipRight : x1);
+	x2 = (x2 < clipLeft) ? clipLeft : ((x2 > clipRight) ? clipRight : x2);
+	y1 = (y1 < clipTop) ? clipTop : ((y1 > clipBottom) ? clipBottom : y1);
+	y2 = (y2 < clipTop) ? clipTop : ((y2 > clipBottom) ? clipBottom : y2);
+
+	/* Degenerate boxes: single point, vertical line, horizontal line */
+	if (x1 == x2) {
+		return ((y1 == y2) ? pixelColor(dst, x1, y1, color) : vlineColor(dst, x1, y1, y2, color));
+	}
+	if (y1 == y2) {
+		return (hlineColor(dst, x1, x2, y1, color));
+	}
+
+	/* Extent of the box; the box covers w+1 by h+1 pixels */
+	w = x2 - x1;
+	h = y2 - y1;
+
+	/* Anything but full alpha goes through the blending routine */
+	if ((color & 255) != 255) {
+		return (filledRectAlpha(dst, x1, y1, x1 + w, y1 + h, color));
+	}
+
+	/* Convert 0xRRGGBBAA into the pixel format of the surface */
+	color = SDL_MapRGBA(dst->format, (Uint8) (color >> 24), (Uint8) (color >> 16), (Uint8) (color >> 8), (Uint8) color);
+
+	/* Lock the surface */
+	if (SDL_MUSTLOCK(dst) && (SDL_LockSurface(dst) < 0)) {
+		return (-1);
+	}
+
+	/* Address of the first pixel of the top row and of the very last pixel of the box */
+	bpp = dst->format->BytesPerPixel;
+	pitch = dst->pitch;
+	row = ((Uint8 *) dst->pixels) + bpp * (int) x1 + pitch * (int) y1;
+	lastPixel = row + bpp * w + pitch * h;
+	span = w + 1;
+
+	/* Fill row by row; a row is processed as long as it starts at or before the last pixel */
+	switch (bpp) {
+	case 1:
+		for (; row <= lastPixel; row += pitch) {
+			memset(row, (Uint8) color, span);
+		}
+		break;
+	case 2:
+		for (; row <= lastPixel; row += pitch) {
+			p = row;
+			for (col = 0; col < span; col++) {
+				*(Uint16*) p = color;
+				p += bpp;
+			}
+		}
+		break;
+	case 3:
+		for (; row <= lastPixel; row += pitch) {
+			p = row;
+			for (col = 0; col < span; col++) {
+				/* Byte order of the three color bytes follows the host byte order */
+				if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+					p[0] = (color >> 16) & 0xff;
+					p[1] = (color >> 8) & 0xff;
+					p[2] = color & 0xff;
+				} else {
+					p[0] = color & 0xff;
+					p[1] = (color >> 8) & 0xff;
+					p[2] = (color >> 16) & 0xff;
+				}
+				p += bpp;
+			}
+		}
+		break;
+	default:		/* case 4 */
+		for (; row <= lastPixel; row += pitch) {
+			p = row;
+			for (col = 0; col < span; col++) {
+				*(Uint32 *) p = color;
+				p += bpp;
+			}
+		}
+		break;
+	}
+
+	/* Unlock surface */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return (0);
+}
+
+/*!
+\brief Draw box (filled rectangle) with blending.
+
+Packs the four color components into a 0xRRGGBBAA value and forwards the
+call to boxColor().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first corner of the box.
+\param y1 Y coordinate of the first corner of the box.
+\param x2 X coordinate of the opposite corner of the box.
+\param y2 Y coordinate of the opposite corner of the box.
+\param r The red value of the box to draw.
+\param g The green value of the box to draw.
+\param b The blue value of the box to draw.
+\param a The alpha value of the box to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int boxRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Red ends up in the most significant byte, alpha in the least significant one */
+	const Uint32 packed = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return (boxColor(dst, x1, y1, x2, y2, packed));
+}
+
+/* ----- Line */
+
+/* Non-alpha line drawing code adapted from routine          */
+/* by Pete Shinners, pete@shinners.org                       */
+/* Originally from pygame, http://pygame.seul.org            */
+
+/* Absolute value of a. The argument appears twice in the expansion, so it is evaluated more than once. */
+#define ABS(a) (((a)<0) ? -(a) : (a))
+
+/*!
+\brief Draw line with alpha blending.
+
+The line is clipped against the clipping rectangle of the surface first.
+A result that is a single point, a vertical or a horizontal line is handed
+to pixelColor(), vlineColor() or hlineColor(). For any other line a color
+with alpha 255 is written directly into the pixel buffer, while every other
+alpha value is drawn pixel by pixel through pixelColorNolock().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+\param color The color value of the line to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int lineColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	int deltaX, deltaY;
+	int stepX, stepY;
+
+	/* Clip the line; nothing to draw if it is rejected */
+	if (!(_clipLine(dst, &x1, &y1, &x2, &y2))) {
+		return (0);
+	}
+
+	/* Single point or vertical line */
+	if (x1 == x2) {
+		if (y1 == y2) {
+			return (pixelColor(dst, x1, y1, color));
+		}
+		return ((y1 < y2) ? vlineColor(dst, x1, y1, y2, color) : vlineColor(dst, x1, y2, y1, color));
+	}
+
+	/* Horizontal line; x1 and x2 are known to differ at this point */
+	if (y1 == y2) {
+		return ((x1 < x2) ? hlineColor(dst, x1, x2, y1, color) : hlineColor(dst, x2, x1, y1, color));
+	}
+
+	/* Signed distances and the direction of travel on each axis */
+	deltaX = x2 - x1;
+	deltaY = y2 - y1;
+	stepX = (deltaX >= 0) ? 1 : -1;
+	stepY = (deltaY >= 0) ? 1 : -1;
+
+	/* Lock surface */
+	if (SDL_MUSTLOCK(dst) && (SDL_LockSurface(dst) < 0)) {
+		return (-1);
+	}
+
+	if ((color & 255) == 255) {
+
+		/* No alpha blending - write pixels directly */
+
+		const int bpp = dst->format->BytesPerPixel;
+		const int pitch = dst->pitch;
+		/* Number of pixels covered on each axis */
+		int majorLen = stepX * deltaX + 1;
+		int minorLen = stepY * deltaY + 1;
+		/* Byte offsets for one step along each axis */
+		int majorStep = bpp * stepX;
+		int minorStep = pitch * stepY;
+		int i, acc, hold;
+		Uint8 *p = ((Uint8 *) dst->pixels) + bpp * (int) x1 + pitch * (int) y1;
+
+		/* Convert 0xRRGGBBAA into the pixel format of the surface */
+		color = SDL_MapRGBA(dst->format, (Uint8) (color >> 24), (Uint8) (color >> 16), (Uint8) (color >> 8), (Uint8) color);
+
+		/* The longer axis drives the loop; swap roles for steep lines */
+		if (majorLen < minorLen) {
+			hold = majorLen;
+			majorLen = minorLen;
+			minorLen = hold;
+			hold = majorStep;
+			majorStep = minorStep;
+			minorStep = hold;
+		}
+
+		/*
+		* One pixel per step on the major axis; the accumulator decides
+		* when to also step on the minor axis
+		*/
+		acc = 0;
+		switch (bpp) {
+		case 1:
+			for (i = 0; i < majorLen; i++, p += majorStep) {
+				*p = color;
+				acc += minorLen;
+				if (acc >= majorLen) {
+					acc -= majorLen;
+					p += minorStep;
+				}
+			}
+			break;
+		case 2:
+			for (i = 0; i < majorLen; i++, p += majorStep) {
+				*(Uint16 *) p = color;
+				acc += minorLen;
+				if (acc >= majorLen) {
+					acc -= majorLen;
+					p += minorStep;
+				}
+			}
+			break;
+		case 3:
+			for (i = 0; i < majorLen; i++, p += majorStep) {
+				/* Byte order of the three color bytes follows the host byte order */
+				if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+					p[0] = (color >> 16) & 0xff;
+					p[1] = (color >> 8) & 0xff;
+					p[2] = color & 0xff;
+				} else {
+					p[0] = color & 0xff;
+					p[1] = (color >> 8) & 0xff;
+					p[2] = (color >> 16) & 0xff;
+				}
+				acc += minorLen;
+				if (acc >= majorLen) {
+					acc -= majorLen;
+					p += minorStep;
+				}
+			}
+			break;
+		default:		/* case 4 */
+			for (i = 0; i < majorLen; i++, p += majorStep) {
+				*(Uint32 *) p = color;
+				acc += minorLen;
+				if (acc >= majorLen) {
+					acc -= majorLen;
+					p += minorStep;
+				}
+			}
+			break;
+		}
+
+	} else {
+
+		/* Alpha blending required - draw pixel by pixel */
+
+		const int twoDx = ABS(deltaX) << 1;
+		const int twoDy = ABS(deltaY) << 1;
+		int px = x1;
+		int py = y1;
+		int d;
+
+		if (twoDx > twoDy) {
+			/* X is the driving axis */
+			d = twoDy - (twoDx >> 1);
+			while (px != x2) {
+				pixelColorNolock (dst, px, py, color);
+				if (d > 0 || (d == 0 && stepX == 1)) {
+					py += stepY;
+					d -= twoDx;
+				}
+				px += stepX;
+				d += twoDy;
+			}
+		} else {
+			/* Y is the driving axis */
+			d = twoDx - (twoDy >> 1);
+			while (py != y2) {
+				pixelColorNolock (dst, px, py, color);
+				if (d > 0 || ((d == 0) && (stepY == 1))) {
+					px += stepX;
+					d -= twoDy;
+				}
+				py += stepY;
+				d += twoDx;
+			}
+		}
+		/* The loops stop before the final position is drawn */
+		pixelColorNolock (dst, px, py, color);
+
+	}
+
+	/* Unlock surface */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return (0);
+}
+
+/*!
+\brief Draw line with alpha blending.
+
+Packs the four color components into a 0xRRGGBBAA value and forwards the
+call to lineColor().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+\param r The red value of the line to draw.
+\param g The green value of the line to draw.
+\param b The blue value of the line to draw.
+\param a The alpha value of the line to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int lineRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Red ends up in the most significant byte, alpha in the least significant one */
+	const Uint32 packed = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return (lineColor(dst, x1, y1, x2, y2, packed));
+}
+
+/* AA Line */
+
+/* Number of intensity levels used for anti-aliased line weights (2^AAbits). */
+#define AAlevels 256
+/* Number of most significant bits of the error accumulator that form the intensity weight. */
+#define AAbits 8
+
+/*!
+\brief Internal function to draw anti-aliased line with alpha blending and endpoint control.
+
+This implementation of the Wu antialiasing code is based on Mike Abrash's
+DDJ article which was reprinted as Chapter 42 of his Graphics Programming
+Black Book, but has been optimized to work with SDL and utilizes 32-bit
+fixed-point arithmetic by A. Schiffler. The endpoint control allows
+suppressing the drawing of the last pixel, which is useful for rendering
+continuous aa-lines with alpha<255.
+
+The line is clipped with _clipLine() first. Vertical and horizontal lines,
+and diagonal lines with dx == dy when the endpoint is requested, are
+delegated to vlineColor(), hlineColor(), pixelColor() or lineColor().
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the aa-line.
+\param y1 Y coordinate of the first point of the aa-line.
+\param x2 X coordinate of the second point of the aa-line.
+\param y2 Y coordinate of the second point of the aa-line.
+\param color The color value of the aa-line to draw (0xRRGGBBAA).
+\param draw_endpoint Flag indicating if the endpoint should be drawn; draw if non-zero.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int _aalineColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color, int draw_endpoint)
+{
+	Sint32 xx0, yy0, xx1, yy1;
+	int result;
+	Uint32 intshift, erracc, erradj;
+	Uint32 erracctmp, wgt, wgtcompmask;
+	int dx, dy, tmp, xdir, y0p1, x0pxdir;
+
+	/*
+	* Check visibility of clipping rectangle
+	*/
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return(0);
+	}
+
+	/*
+	* Clip line and test if we have to draw
+	*/
+	if (!(_clipLine(dst, &x1, &y1, &x2, &y2))) {
+		return (0);
+	}
+
+	/*
+	* Keep on working with 32bit numbers
+	*/
+	xx0 = x1;
+	yy0 = y1;
+	xx1 = x2;
+	yy1 = y2;
+
+	/*
+	* Reorder points if required so that yy0 <= yy1.
+	* Only the 32bit working copies are swapped; x1/y1 and x2/y2 keep
+	* the order in which the (clipped) points were passed in.
+	*/
+	if (yy0 > yy1) {
+		tmp = yy0;
+		yy0 = yy1;
+		yy1 = tmp;
+		tmp = xx0;
+		xx0 = xx1;
+		xx1 = tmp;
+	}
+
+	/*
+	* Calculate distance; dy is never negative after the reordering above
+	*/
+	dx = xx1 - xx0;
+	dy = yy1 - yy0;
+
+	/*
+	* Check for special cases
+	*/
+	if (dx == 0) {
+		/*
+		* Vertical line
+		*/
+		if (draw_endpoint)
+		{
+			return (vlineColor(dst, x1, y1, y2, color));
+		} else {
+			if (dy>0) {
+				/*
+				* NOTE(review): yy0+dy equals yy1, so this looks like it still
+				* covers the end point - confirm against vlineColor()
+				*/
+				return (vlineColor(dst, x1, yy0, yy0+dy, color));
+			} else {
+				return (pixelColor(dst, x1, y1, color));
+			}
+		}
+	} else if (dy == 0) {
+		/*
+		* Horizontal line
+		*/
+		if (draw_endpoint)
+		{
+			return (hlineColor(dst, x1, x2, y1, color));
+		} else {
+			/* dx is known to be non-zero here since dx == 0 was handled above */
+			if (dx!=0) {
+				return (hlineColor(dst, xx0, xx0+dx, y1, color));
+			} else {
+				return (pixelColor(dst, x1, y1, color));
+			}
+		}
+	} else if ((dx == dy) && (draw_endpoint)) {
+		/*
+		* Diagonal line (with endpoint); dx is still signed here, so only
+		* diagonals with positive dx take this path
+		*/
+		return (lineColor(dst, x1, y1, x2, y2, color));
+	}
+
+	/*
+	* Adjust for negative dx and set xdir
+	*/
+	if (dx >= 0) {
+		xdir = 1;
+	} else {
+		xdir = -1;
+		dx = (-dx);
+	}
+
+	/*
+	* Line is not horizontal, vertical or diagonal (with endpoint)
+	*/
+	result = 0;
+
+	/*
+	* Zero accumulator
+	*/
+	erracc = 0;
+
+	/*
+	* # of bits by which to shift erracc to get intensity level
+	*/
+	intshift = 32 - AAbits;
+
+	/*
+	* Mask used to flip all bits in an intensity weighting
+	* (assigned here but not referenced again in this function)
+	*/
+	wgtcompmask = AAlevels - 1;
+
+	/* Lock surface */
+	if (SDL_MUSTLOCK(dst)) {
+		if (SDL_LockSurface(dst) < 0) {
+			return (-1);
+		}
+	}
+
+	/*
+	* Draw the initial pixel in the foreground color
+	*/
+	result |= pixelColorNolock(dst, x1, y1, color);
+
+	/*
+	* x-major or y-major?
+	*/
+	if (dy > dx) {
+
+		/*
+		* y-major.  Calculate the fractional part of a pixel that X advances
+		* every time Y advances 1 pixel as a 16-bit fraction, then shift it
+		* into the upper 16 bits of the 32-bit step value. The division
+		* truncates so that we won't overrun the endpoint along the X axis
+		*/
+		/*
+		* Not-so-portable version: erradj = ((Uint64)dx << 32) / (Uint64)dy;
+		*/
+		erradj = ((dx << 16) / dy) << 16;
+
+		/*
+		* draw all pixels other than the first and last
+		*/
+		x0pxdir = xx0 + xdir;
+		while (--dy) {
+			erracctmp = erracc;
+			erracc += erradj;
+			/* the unsigned sum did not grow, i.e. the accumulator wrapped around */
+			if (erracc <= erracctmp) {
+				/*
+				* rollover in error accumulator, x coord advances
+				*/
+				xx0 = x0pxdir;
+				x0pxdir += xdir;
+			}
+			yy0++;		/* y-major so always advance Y */
+
+			/*
+			* the AAbits most significant bits of erracc give us the intensity
+			* weighting for this pixel, and the complement of the weighting for
+			* the paired pixel.
+			*/
+			wgt = (erracc >> intshift) & 255;
+			result |= pixelColorWeightNolock (dst, xx0, yy0, color, 255 - wgt);
+			result |= pixelColorWeightNolock (dst, x0pxdir, yy0, color, wgt);
+		}
+
+	} else {
+
+		/*
+		* x-major line.  Calculate the fractional part of a pixel that Y
+		* advances each time X advances 1 pixel as a 16-bit fraction, then
+		* shift it into the upper 16 bits of the 32-bit step value. The
+		* division truncates so that we won't overrun the endpoint along
+		* the Y axis.
+		*/
+		/*
+		* Not-so-portable version: erradj = ((Uint64)dy << 32) / (Uint64)dx;
+		*/
+		erradj = ((dy << 16) / dx) << 16;
+
+		/*
+		* draw all pixels other than the first and last
+		*/
+		y0p1 = yy0 + 1;
+		while (--dx) {
+
+			erracctmp = erracc;
+			erracc += erradj;
+			/* the unsigned sum did not grow, i.e. the accumulator wrapped around */
+			if (erracc <= erracctmp) {
+				/*
+				* Accumulator turned over, advance y
+				*/
+				yy0 = y0p1;
+				y0p1++;
+			}
+			xx0 += xdir;	/* x-major so always advance X */
+			/*
+			* the AAbits most significant bits of erracc give us the intensity
+			* weighting for this pixel, and the complement of the weighting for
+			* the paired pixel.
+			*/
+			wgt = (erracc >> intshift) & 255;
+			result |= pixelColorWeightNolock (dst, xx0, yy0, color, 255 - wgt);
+			result |= pixelColorWeightNolock (dst, xx0, y0p1, color, wgt);
+		}
+	}
+
+	/*
+	* Do we have to draw the endpoint
+	*/
+	if (draw_endpoint) {
+		/*
+		* Draw final pixel, always exactly intersected by the line and doesn't
+		* need to be weighted.
+		*/
+		result |= pixelColorNolock (dst, x2, y2, color);
+	}
+
+	/* Unlock surface */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw anti-aliased line with alpha blending.
+
+Forwards to _aalineColor() with the endpoint flag set, so the last pixel of
+the line is drawn as well.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the aa-line.
+\param y1 Y coordinate of the first point of the aa-line.
+\param x2 X coordinate of the second point of the aa-line.
+\param y2 Y coordinate of the second point of the aa-line.
+\param color The color value of the aa-line to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aalineColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint32 color)
+{
+	/* Non-zero: the endpoint is drawn too */
+	const int withEndpoint = 1;
+
+	return (_aalineColor(dst, x1, y1, x2, y2, color, withEndpoint));
+}
+
+/*!
+\brief Draw anti-aliased line with alpha blending.
+
+Packs the four color components into a 0xRRGGBBAA value and forwards the
+call to _aalineColor() with the endpoint flag set.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the aa-line.
+\param y1 Y coordinate of the first point of the aa-line.
+\param x2 X coordinate of the second point of the aa-line.
+\param y2 Y coordinate of the second point of the aa-line.
+\param r The red value of the aa-line to draw.
+\param g The green value of the aa-line to draw.
+\param b The blue value of the aa-line to draw.
+\param a The alpha value of the aa-line to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aalineRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Red ends up in the most significant byte, alpha in the least significant one */
+	const Uint32 packed = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return (_aalineColor(dst, x1, y1, x2, y2, packed, 1));
+}
+
+
+/* ----- Circle */
+
+/*!
+\brief Draw circle with blending.
+
+Note: Circle drawing routine is based on an algorithm from the sge library,
+but modified by A. Schiffler for multiple pixel-draw removal and other
+minor speedup changes.
+
+A negative radius is rejected with -1 and a radius of 0 draws a single
+pixel. Nothing is drawn when the bounding box of the circle lies completely
+outside of the clipping rectangle. A color with alpha 255 is drawn with
+fastPixelColorNolock(), any other alpha with pixelColorNolock().
+
+The surface is locked once (if SDL_MUSTLOCK reports that it needs locking)
+and unlocked exactly once at the end, so a lock held by the caller is left
+untouched.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the circle.
+\param y Y coordinate of the center of the circle.
+\param rad Radius in pixels of the circle.
+\param color The color value of the circle to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int circleColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint32 color)
+{
+	Sint16 left, right, top, bottom;
+	int result;
+	Sint16 x1, y1, x2, y2;
+	/* (cx, cy) walks one octant of the circle, starting at (0, rad) */
+	Sint16 cx = 0;
+	Sint16 cy = rad;
+	/* Decision variable and its two increments */
+	Sint16 df = 1 - rad;
+	Sint16 d_e = 3;
+	Sint16 d_se = -2 * rad + 5;
+	Sint16 xpcx, xmcx, xpcy, xmcy;
+	Sint16 ypcy, ymcy, ypcx, ymcx;
+	Uint8 *colorptr;
+
+	/*
+	* Check visibility of clipping rectangle
+	*/
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return(0);
+	}
+
+	/*
+	* Sanity check radius
+	*/
+	if (rad < 0) {
+		return (-1);
+	}
+
+	/*
+	* Special case for rad=0 - draw a point
+	*/
+	if (rad == 0) {
+		return (pixelColor(dst, x, y, color));
+	}
+
+	/*
+	* Get circle and clipping boundary and
+	* test if bounding box of circle is visible
+	*/
+	x2 = x + rad;
+	left = dst->clip_rect.x;
+	if (x2<left) {
+		return(0);
+	}
+	x1 = x - rad;
+	right = dst->clip_rect.x + dst->clip_rect.w - 1;
+	if (x1>right) {
+		return(0);
+	}
+	y2 = y + rad;
+	top = dst->clip_rect.y;
+	if (y2<top) {
+		return(0);
+	}
+	y1 = y - rad;
+	bottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	if (y1>bottom) {
+		return(0);
+	}
+
+	/*
+	* Draw circle
+	*/
+	result = 0;
+
+	/* Lock surface */
+	if (SDL_MUSTLOCK(dst)) {
+		if (SDL_LockSurface(dst) < 0) {
+			return (-1);
+		}
+	}
+
+	/*
+	* Alpha Check
+	*/
+	if ((color & 255) == 255) {
+
+		/*
+		* No Alpha - direct memory writes
+		*/
+
+		/*
+		* Setup color: read the 0xRRGGBBAA bytes in host byte order and
+		* convert them into the pixel format of the surface
+		*/
+		colorptr = (Uint8 *) & color;
+		if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+			color = SDL_MapRGBA(dst->format, colorptr[0], colorptr[1], colorptr[2], colorptr[3]);
+		} else {
+			color = SDL_MapRGBA(dst->format, colorptr[3], colorptr[2], colorptr[1], colorptr[0]);
+		}
+
+		/*
+		* Draw
+		*/
+		do {
+			/* Pixels above and below the center; cx == 0 would draw them twice */
+			ypcy = y + cy;
+			ymcy = y - cy;
+			if (cx > 0) {
+				xpcx = x + cx;
+				xmcx = x - cx;
+				result |= fastPixelColorNolock(dst, xmcx, ypcy, color);
+				result |= fastPixelColorNolock(dst, xpcx, ypcy, color);
+				result |= fastPixelColorNolock(dst, xmcx, ymcy, color);
+				result |= fastPixelColorNolock(dst, xpcx, ymcy, color);
+			} else {
+				result |= fastPixelColorNolock(dst, x, ymcy, color);
+				result |= fastPixelColorNolock(dst, x, ypcy, color);
+			}
+			/* Mirrored pixels left and right of the center; skipped when cx == cy */
+			xpcy = x + cy;
+			xmcy = x - cy;
+			if ((cx > 0) && (cx != cy)) {
+				ypcx = y + cx;
+				ymcx = y - cx;
+				result |= fastPixelColorNolock(dst, xmcy, ypcx, color);
+				result |= fastPixelColorNolock(dst, xpcy, ypcx, color);
+				result |= fastPixelColorNolock(dst, xmcy, ymcx, color);
+				result |= fastPixelColorNolock(dst, xpcy, ymcx, color);
+			} else if (cx == 0) {
+				result |= fastPixelColorNolock(dst, xmcy, y, color);
+				result |= fastPixelColorNolock(dst, xpcy, y, color);
+			}
+			/*
+			* Update
+			*/
+			if (df < 0) {
+				df += d_e;
+				d_e += 2;
+				d_se += 2;
+			} else {
+				df += d_se;
+				d_e += 2;
+				d_se += 4;
+				cy--;
+			}
+			cx++;
+		} while (cx <= cy);
+
+		/*
+		* The surface is NOT unlocked here: the common exit below releases
+		* the lock taken above. An extra unconditional unlock at this point
+		* would unbalance the lock count of the surface.
+		*/
+
+	} else {
+
+		/*
+		* Using Alpha - blended pixel blits
+		*/
+
+		do {
+			/*
+			* Draw
+			*/
+			ypcy = y + cy;
+			ymcy = y - cy;
+			if (cx > 0) {
+				xpcx = x + cx;
+				xmcx = x - cx;
+				result |= pixelColorNolock (dst, xmcx, ypcy, color);
+				result |= pixelColorNolock (dst, xpcx, ypcy, color);
+				result |= pixelColorNolock (dst, xmcx, ymcy, color);
+				result |= pixelColorNolock (dst, xpcx, ymcy, color);
+			} else {
+				result |= pixelColorNolock (dst, x, ymcy, color);
+				result |= pixelColorNolock (dst, x, ypcy, color);
+			}
+			xpcy = x + cy;
+			xmcy = x - cy;
+			if ((cx > 0) && (cx != cy)) {
+				ypcx = y + cx;
+				ymcx = y - cx;
+				result |= pixelColorNolock (dst, xmcy, ypcx, color);
+				result |= pixelColorNolock (dst, xpcy, ypcx, color);
+				result |= pixelColorNolock (dst, xmcy, ymcx, color);
+				result |= pixelColorNolock (dst, xpcy, ymcx, color);
+			} else if (cx == 0) {
+				result |= pixelColorNolock (dst, xmcy, y, color);
+				result |= pixelColorNolock (dst, xpcy, y, color);
+			}
+			/*
+			* Update
+			*/
+			if (df < 0) {
+				df += d_e;
+				d_e += 2;
+				d_se += 2;
+			} else {
+				df += d_se;
+				d_e += 2;
+				d_se += 4;
+				cy--;
+			}
+			cx++;
+		} while (cx <= cy);
+
+	}				/* Alpha check */
+
+	/* Unlock surface (single unlock matching the single lock above) */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw circle with blending.
+
+Packs the four color components into a 0xRRGGBBAA value and forwards the
+call to circleColor().
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the circle.
+\param y Y coordinate of the center of the circle.
+\param rad Radius in pixels of the circle.
+\param r The red value of the circle to draw.
+\param g The green value of the circle to draw.
+\param b The blue value of the circle to draw.
+\param a The alpha value of the circle to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int circleRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Red ends up in the most significant byte, alpha in the least significant one */
+	const Uint32 packed = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return (circleColor(dst, x, y, rad, packed));
+}
+
+/* ----- Arc */
+
+/*!
+\brief Arc with blending.
+
+Note Arc drawing is based on circle algorithm by A. Schiffler and
+written by D. Raber. Calculates which octants arc goes through and
+renders pixels accordingly.
+
+A negative radius is rejected with -1 and a radius of 0 draws a single
+pixel. Nothing is drawn when the bounding box of the arc's circle lies
+completely outside of the clipping rectangle. A color with alpha 255 is
+drawn with fastPixelColorNolock(), any other alpha with pixelColorNolock().
+
+The surface is locked once (if SDL_MUSTLOCK reports that it needs locking)
+and unlocked exactly once at the end, so a lock held by the caller is left
+untouched.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the arc.
+\param y Y coordinate of the center of the arc.
+\param rad Radius in pixels of the arc.
+\param start Starting angle in degrees of the arc. 0 degrees is down, increasing counterclockwise.
+\param end Ending angle in degrees of the arc. 0 degrees is down, increasing counterclockwise.
+\param color The color value of the arc to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int arcColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color)
+{
+	Sint16 left, right, top, bottom;
+	int result;
+	Sint16 x1, y1, x2, y2;
+	/* (cx, cy) walks one octant of the circle, starting at (0, rad) */
+	Sint16 cx = 0;
+	Sint16 cy = rad;
+	/* Decision variable and its two increments */
+	Sint16 df = 1 - rad;
+	Sint16 d_e = 3;
+	Sint16 d_se = -2 * rad + 5;
+	Sint16 xpcx, xmcx, xpcy, xmcy;
+	Sint16 ypcy, ymcy, ypcx, ymcx;
+	Uint8 *colorptr;
+	Uint8 drawoct;
+	int startoct, endoct, oct, stopval_start = 0, stopval_end = 0;
+	double dstart, dend, temp = 0.;
+
+	/*
+	* Check visibility of clipping rectangle
+	*/
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return(0);
+	}
+
+	/*
+	* Sanity check radius
+	*/
+	if (rad < 0) {
+		return (-1);
+	}
+
+	/*
+	* Special case for rad=0 - draw a point
+	*/
+	if (rad == 0) {
+		return (pixelColor(dst, x, y, color));
+	}
+
+	/*
+	* Get arc's circle and clipping boundary and
+	* test if bounding box of circle is visible
+	*/
+	x2 = x + rad;
+	left = dst->clip_rect.x;
+	if (x2<left) {
+		return(0);
+	}
+	x1 = x - rad;
+	right = dst->clip_rect.x + dst->clip_rect.w - 1;
+	if (x1>right) {
+		return(0);
+	}
+	y2 = y + rad;
+	top = dst->clip_rect.y;
+	if (y2<top) {
+		return(0);
+	}
+	y1 = y - rad;
+	bottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	if (y1>bottom) {
+		return(0);
+	}
+
+	/*
+	* Octant labelling
+	* (kept in a block comment: some rows of the picture end in a
+	* backslash, which would splice the following line into a line comment)
+	*
+	*   \ 5 | 6 /
+	*    \  |  /
+	*   4 \ | / 7
+	*      \|/
+	* ------+------ +x
+	*      /|\
+	*   3 / | \ 0
+	*    /  |  \
+	*   / 2 | 1 \
+	*       +y
+	*/
+
+	/*
+	* Initially reset bitmask to binary 00000000, then set whether or not
+	* to keep drawing a given octant; bit n stands for octant n.
+	* For example: binary 00111100 means we're drawing in octants 2-5
+	*/
+	drawoct = 0;
+
+	/*
+	* Fixup angles
+	*/
+	start %= 360;
+	end %= 360;
+	/* 0 <= start & end < 360; note that sometimes start > end - if so, arc goes back through 0. */
+	while (start < 0) start += 360;
+	while (end < 0) end += 360;
+	start %= 360;
+	end %= 360;
+
+	/* now, we find which octants we're drawing in. */
+	startoct = start / 45;
+	endoct = end / 45;
+	oct = startoct - 1; /* we increment as first step in loop */
+
+	/*
+	* stopval_start, stopval_end;
+	* what values of cx to stop at.
+	*/
+	do {
+		oct = (oct + 1) % 8;
+
+		if (oct == startoct) {
+			/* need to compute stopval_start for this octant.  Look at picture above if this is unclear */
+			dstart = (double)start;
+			switch (oct)
+			{
+			case 0:
+			case 3:
+				temp = sin(dstart * M_PI / 180.);
+				break;
+			case 1:
+			case 6:
+				temp = cos(dstart * M_PI / 180.);
+				break;
+			case 2:
+			case 5:
+				temp = -cos(dstart * M_PI / 180.);
+				break;
+			case 4:
+			case 7:
+				temp = -sin(dstart * M_PI / 180.);
+				break;
+			}
+			temp *= rad;
+			stopval_start = (int)temp; /* the cast truncates towards zero */
+			/*
+			* This isn't arbitrary, but requires graph paper to explain well.
+			* The basic idea is that we're always changing drawoct after we draw, so we
+			* stop immediately after we render the last sensible pixel at x = ((int)temp).
+			*/
+
+			/* and whether to draw in this octant initially */
+			if (oct % 2) drawoct |= (1 << oct); /* this is basically like saying drawoct[oct] = true, if drawoct were a bool array */
+			else		 drawoct &= 255 - (1 << oct); /* this is basically like saying drawoct[oct] = false */
+		}
+		if (oct == endoct) {
+			/* need to compute stopval_end for this octant */
+			dend = (double)end;
+			switch (oct)
+			{
+			case 0:
+			case 3:
+				temp = sin(dend * M_PI / 180);
+				break;
+			case 1:
+			case 6:
+				temp = cos(dend * M_PI / 180);
+				break;
+			case 2:
+			case 5:
+				temp = -cos(dend * M_PI / 180);
+				break;
+			case 4:
+			case 7:
+				temp = -sin(dend * M_PI / 180);
+				break;
+			}
+			temp *= rad;
+			stopval_end = (int)temp;
+
+			/* and whether to draw in this octant initially */
+			if (startoct == endoct)	{
+				/*
+				* note:      we start drawing, stop, then start again in this case
+				* otherwise: we only draw in this octant, so initialize it to false, it will get set back to true
+				*/
+				if (start > end) {
+					/*
+					* unfortunately, if we're in the same octant and need to draw over the whole circle,
+					* we need to set the rest to true, because the while loop will end at the bottom.
+					*/
+					drawoct = 255;
+				} else {
+					drawoct &= 255 - (1 << oct);
+				}
+			}
+			else if (oct % 2) drawoct &= 255 - (1 << oct);
+			else			  drawoct |= (1 << oct);
+		} else if (oct != startoct) { /* already verified that it's != endoct */
+			drawoct |= (1 << oct); /* draw this entire segment */
+		}
+	} while (oct != endoct);
+
+	/* so now we have what octants to draw and when to draw them.  all that's left is the actual raster code. */
+
+	/* Lock surface */
+	if (SDL_MUSTLOCK(dst)) {
+		if (SDL_LockSurface(dst) < 0) {
+			return (-1);
+		}
+	}
+
+	/*
+	* Draw arc
+	*/
+	result = 0;
+
+	/*
+	* Alpha Check
+	*/
+	if ((color & 255) == 255) {
+
+		/*
+		* No Alpha - direct memory writes
+		*/
+
+		/*
+		* Setup color: read the 0xRRGGBBAA bytes in host byte order and
+		* convert them into the pixel format of the surface
+		*/
+		colorptr = (Uint8 *) & color;
+		if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+			color = SDL_MapRGBA(dst->format, colorptr[0], colorptr[1], colorptr[2], colorptr[3]);
+		} else {
+			color = SDL_MapRGBA(dst->format, colorptr[3], colorptr[2], colorptr[1], colorptr[0]);
+		}
+
+		/*
+		* Draw
+		*/
+		do {
+			ypcy = y + cy;
+			ymcy = y - cy;
+			if (cx > 0) {
+				xpcx = x + cx;
+				xmcx = x - cx;
+				/* always check if we're drawing a certain octant before adding a pixel to that octant. */
+				if (drawoct & 4)  result |= fastPixelColorNolock(dst, xmcx, ypcy, color); /* 4 == 1 << 2, i.e. drawoct[2] */
+				if (drawoct & 2)  result |= fastPixelColorNolock(dst, xpcx, ypcy, color);
+				if (drawoct & 32) result |= fastPixelColorNolock(dst, xmcx, ymcy, color);
+				if (drawoct & 64) result |= fastPixelColorNolock(dst, xpcx, ymcy, color);
+			} else {
+				if (drawoct & 6)  result |= fastPixelColorNolock(dst, x, ypcy, color); /* 4 + 2; drawoct[2] || drawoct[1] */
+				if (drawoct & 96) result |= fastPixelColorNolock(dst, x, ymcy, color); /* 32 + 64 */
+			}
+
+			xpcy = x + cy;
+			xmcy = x - cy;
+			if (cx > 0 && cx != cy) {
+				ypcx = y + cx;
+				ymcx = y - cx;
+				if (drawoct & 8)   result |= fastPixelColorNolock(dst, xmcy, ypcx, color);
+				if (drawoct & 1)   result |= fastPixelColorNolock(dst, xpcy, ypcx, color);
+				if (drawoct & 16)  result |= fastPixelColorNolock(dst, xmcy, ymcx, color);
+				if (drawoct & 128) result |= fastPixelColorNolock(dst, xpcy, ymcx, color);
+			} else if (cx == 0) {
+				if (drawoct & 24)  result |= fastPixelColorNolock(dst, xmcy, y, color); /* 8 + 16 */
+				if (drawoct & 129) result |= fastPixelColorNolock(dst, xpcy, y, color); /* 1 + 128 */
+			}
+
+			/*
+			* Update whether we're drawing an octant
+			*/
+			if (stopval_start == cx) {
+				/* works like an on-off switch because start & end may be in the same octant. */
+				if (drawoct & (1 << startoct)) drawoct &= 255 - (1 << startoct);
+				else drawoct |= (1 << startoct);
+			}
+			if (stopval_end == cx) {
+				if (drawoct & (1 << endoct)) drawoct &= 255 - (1 << endoct);
+				else drawoct |= (1 << endoct);
+			}
+
+			/*
+			* Update pixels
+			*/
+			if (df < 0) {
+				df += d_e;
+				d_e += 2;
+				d_se += 2;
+			} else {
+				df += d_se;
+				d_e += 2;
+				d_se += 4;
+				cy--;
+			}
+			cx++;
+		} while (cx <= cy);
+
+		/*
+		* The surface is NOT unlocked here: the common exit below releases
+		* the lock taken above. An extra unconditional unlock at this point
+		* would unbalance the lock count of the surface.
+		*/
+
+	} else {
+
+		/*
+		* Using Alpha - blended pixel blits
+		*/
+
+		do {
+			ypcy = y + cy;
+			ymcy = y - cy;
+			if (cx > 0) {
+				xpcx = x + cx;
+				xmcx = x - cx;
+
+				/* always check if we're drawing a certain octant before adding a pixel to that octant. */
+				if (drawoct & 4)  result |= pixelColorNolock(dst, xmcx, ypcy, color);
+				if (drawoct & 2)  result |= pixelColorNolock(dst, xpcx, ypcy, color);
+				if (drawoct & 32) result |= pixelColorNolock(dst, xmcx, ymcy, color);
+				if (drawoct & 64) result |= pixelColorNolock(dst, xpcx, ymcy, color);
+			} else {
+				if (drawoct & 96) result |= pixelColorNolock(dst, x, ymcy, color);
+				if (drawoct & 6)  result |= pixelColorNolock(dst, x, ypcy, color);
+			}
+
+			xpcy = x + cy;
+			xmcy = x - cy;
+			if (cx > 0 && cx != cy) {
+				ypcx = y + cx;
+				ymcx = y - cx;
+				if (drawoct & 8)   result |= pixelColorNolock(dst, xmcy, ypcx, color);
+				if (drawoct & 1)   result |= pixelColorNolock(dst, xpcy, ypcx, color);
+				if (drawoct & 16)  result |= pixelColorNolock(dst, xmcy, ymcx, color);
+				if (drawoct & 128) result |= pixelColorNolock(dst, xpcy, ymcx, color);
+			} else if (cx == 0) {
+				if (drawoct & 24)  result |= pixelColorNolock(dst, xmcy, y, color);
+				if (drawoct & 129) result |= pixelColorNolock(dst, xpcy, y, color);
+			}
+
+			/*
+			* Update whether we're drawing an octant
+			*/
+			if (stopval_start == cx) {
+				/*
+				* works like an on-off switch.
+				* This is just in case start & end are in the same octant.
+				*/
+				if (drawoct & (1 << startoct)) drawoct &= 255 - (1 << startoct);
+				else						   drawoct |= (1 << startoct);
+			}
+			if (stopval_end == cx) {
+				if (drawoct & (1 << endoct)) drawoct &= 255 - (1 << endoct);
+				else						 drawoct |= (1 << endoct);
+			}
+
+			/*
+			* Update pixels
+			*/
+			if (df < 0) {
+				df += d_e;
+				d_e += 2;
+				d_se += 2;
+			} else {
+				df += d_se;
+				d_e += 2;
+				d_se += 4;
+				cy--;
+			}
+			cx++;
+		} while (cx <= cy);
+
+	}				/* Alpha check */
+
+	/* Unlock surface (single unlock matching the single lock above) */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return (result);
+}
+
+/*!
+\brief Arc with blending, color given as separate channels.
+
+Packs the four channel values into a single 0xRRGGBBAA value and hands the
+drawing over to arcColor().
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the arc.
+\param y Y coordinate of the center of the arc.
+\param rad Radius in pixels of the arc.
+\param start Starting angle in degrees of the arc. 0 degrees is down, increasing counterclockwise.
+\param end Ending angle in degrees of the arc. 0 degrees is down, increasing counterclockwise.
+\param r The red value of the arc to draw.
+\param g The green value of the arc to draw.
+\param b The blue value of the arc to draw.
+\param a The alpha value of the arc to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int arcRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed;
+
+	/* Assemble 0xRRGGBBAA, most significant channel first */
+	packed = (Uint32) r << 24;
+	packed |= (Uint32) g << 16;
+	packed |= (Uint32) b << 8;
+	packed |= (Uint32) a;
+
+	return arcColor(dst, x, y, rad, start, end, packed);
+}
+
+/* ----- AA Circle */
+
+
+/*!
+\brief Draw anti-aliased circle with blending.
+
+Note: A circle is drawn as an anti-aliased ellipse whose horizontal and
+vertical radii are equal, so this forwards to aaellipseColor().
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the aa-circle.
+\param y Y coordinate of the center of the aa-circle.
+\param rad Radius in pixels of the aa-circle.
+\param color The color value of the aa-circle to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aacircleColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint32 color)
+{
+	/* Same radius on both axes */
+	const Sint16 rx = rad;
+	const Sint16 ry = rad;
+
+	return aaellipseColor(dst, x, y, rx, ry, color);
+}
+
+/*!
+\brief Draw anti-aliased circle with blending, color given as separate channels.
+
+Packs the channels into 0xRRGGBBAA and draws an anti-aliased ellipse with
+both radii set to rad via aaellipseColor().
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the aa-circle.
+\param y Y coordinate of the center of the aa-circle.
+\param rad Radius in pixels of the aa-circle.
+\param r The red value of the aa-circle to draw.
+\param g The green value of the aa-circle to draw.
+\param b The blue value of the aa-circle to draw.
+\param a The alpha value of the aa-circle to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aacircleRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed;
+
+	/* Assemble 0xRRGGBBAA, most significant channel first */
+	packed = (Uint32) r << 24;
+	packed |= (Uint32) g << 16;
+	packed |= (Uint32) b << 8;
+	packed |= (Uint32) a;
+
+	return aaellipseColor(dst, x, y, rad, rad, packed);
+}
+
+/* ----- Filled Circle */
+
+/*!
+\brief Draw filled circle with blending.
+
+Note: Based on algorithms from sge library with modifications by A. Schiffler for
+multiple-hline draw removal and other minor speedup changes.
+
+The circle is filled with horizontal spans drawn through hlineColor(). A span
+is only emitted when its row offset differs from the one used on the previous
+step, which avoids drawing the same row more than once.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the filled circle.
+\param y Y coordinate of the center of the filled circle.
+\param rad Radius in pixels of the filled circle.
+\param color The color value of the filled circle to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledCircleColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint32 color)
+{
+	Sint16 clipL, clipR, clipT, clipB;
+	Sint16 boxL, boxT, boxR, boxB;
+	Sint16 sx = 0;				/* offset that grows each step */
+	Sint16 sy = rad;			/* offset that shrinks towards sx */
+	Sint16 prevSx = (Sint16) 0xffff;	/* last sx a span was drawn for */
+	Sint16 prevSy = (Sint16) 0xffff;	/* last sy a span was drawn for */
+	Sint16 err = 1 - rad;			/* decision variable */
+	Sint16 stepE = 3;
+	Sint16 stepSE = -2 * rad + 5;
+	Sint16 capL, capR, bodyL, bodyR;
+	Sint16 rowPlus, rowMinus;
+	int status = 0;
+
+	/* Nothing is visible through an empty clipping rectangle */
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/* A negative radius is an error */
+	if (rad < 0) {
+		return -1;
+	}
+
+	/* A zero radius degenerates to a single pixel */
+	if (rad == 0) {
+		return pixelColor(dst, x, y, color);
+	}
+
+	/* Bounding box of the circle and edges of the clipping rectangle */
+	boxL = x - rad;
+	boxR = x + rad;
+	boxT = y - rad;
+	boxB = y + rad;
+	clipL = dst->clip_rect.x;
+	clipR = dst->clip_rect.x + dst->clip_rect.w - 1;
+	clipT = dst->clip_rect.y;
+	clipB = dst->clip_rect.y + dst->clip_rect.h - 1;
+
+	/* Reject when the bounding box lies completely outside the clip area */
+	if ((boxR < clipL) || (boxL > clipR) || (boxB < clipT) || (boxT > clipB)) {
+		return 0;
+	}
+
+	for (;;) {
+		capL = x - sx;
+		capR = x + sx;
+		bodyL = x - sy;
+		bodyR = x + sy;
+
+		/* Spans at rows y +/- sy, only when sy changed since the last one */
+		if (prevSy != sy) {
+			if (sy > 0) {
+				rowPlus = y + sy;
+				rowMinus = y - sy;
+				status |= hlineColor(dst, capL, capR, rowPlus, color);
+				status |= hlineColor(dst, capL, capR, rowMinus, color);
+			} else {
+				status |= hlineColor(dst, capL, capR, y, color);
+			}
+			prevSy = sy;
+		}
+
+		/* Spans at rows y -/+ sx; skipped when sx == sy (rows just drawn above) */
+		if (prevSx != sx) {
+			if (sx != sy) {
+				if (sx > 0) {
+					rowMinus = y - sx;
+					rowPlus = y + sx;
+					status |= hlineColor(dst, bodyL, bodyR, rowMinus, color);
+					status |= hlineColor(dst, bodyL, bodyR, rowPlus, color);
+				} else {
+					status |= hlineColor(dst, bodyL, bodyR, y, color);
+				}
+			}
+			prevSx = sx;
+		}
+
+		/* Advance the decision variable; sy only shrinks on the else path */
+		if (err < 0) {
+			err += stepE;
+			stepSE += 2;
+		} else {
+			err += stepSE;
+			stepSE += 4;
+			sy--;
+		}
+		stepE += 2;
+		sx++;
+
+		if (sx > sy) {
+			break;
+		}
+	}
+
+	return status;
+}
+
+/*!
+\brief Draw filled circle with blending, color given as separate channels.
+
+Packs the channels into 0xRRGGBBAA and forwards to filledCircleColor().
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the filled circle.
+\param y Y coordinate of the center of the filled circle.
+\param rad Radius in pixels of the filled circle.
+\param r The red value of the filled circle to draw.
+\param g The green value of the filled circle to draw.
+\param b The blue value of the filled circle to draw.
+\param a The alpha value of the filled circle to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledCircleRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed;
+
+	/* Assemble 0xRRGGBBAA, most significant channel first */
+	packed = (Uint32) r << 24;
+	packed |= (Uint32) g << 16;
+	packed |= (Uint32) b << 8;
+	packed |= (Uint32) a;
+
+	return filledCircleColor(dst, x, y, rad, packed);
+}
+
+/* ----- Ellipse */
+
+/*!
+\brief Draw ellipse with blending.
+
+Note: Based on algorithms from sge library with modifications by A. Schiffler for
+multiple-pixel draw removal and other minor speedup changes.
+
+A color whose alpha byte is 255 is mapped to the surface format once and
+written with fastPixelColorNolock(); any other alpha is drawn pixel by pixel
+with pixelColorNolock(). The surface is locked once around the whole drawing
+pass, so only the Nolock pixel routines are called while it is held.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the ellipse.
+\param y Y coordinate of the center of the ellipse.
+\param rx Horizontal radius in pixels of the ellipse.
+\param ry Vertical radius in pixels of the ellipse.
+\param color The color value of the ellipse to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int ellipseColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color)
+{
+	Sint16 left, right, top, bottom;
+	int result;
+	Sint16 x1, y1, x2, y2;
+	int ix, iy;
+	int h, i, j, k;
+	int oh, oi, oj, ok;
+	int xmh, xph, ypk, ymk;
+	int xmi, xpi, ymj, ypj;
+	int xmj, xpj, ymi, ypi;
+	int xmk, xpk, ymh, yph;
+	Uint8 *colorptr;
+
+	/*
+	* Check visibility of clipping rectangle
+	*/
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return(0);
+	}
+
+	/*
+	* Sanity check radii
+	*/
+	if ((rx < 0) || (ry < 0)) {
+		return (-1);
+	}
+
+	/*
+	* Special case for rx=0 - draw a vline
+	*/
+	if (rx == 0) {
+		return (vlineColor(dst, x, y - ry, y + ry, color));
+	}
+	/*
+	* Special case for ry=0 - draw a hline
+	*/
+	if (ry == 0) {
+		return (hlineColor(dst, x - rx, x + rx, y, color));
+	}
+
+	/*
+	* Get ellipse and clipping boundary and
+	* test if bounding box of ellipse is visible
+	*/
+	x2 = x + rx;
+	left = dst->clip_rect.x;
+	if (x2<left) {
+		return(0);
+	}
+	x1 = x - rx;
+	right = dst->clip_rect.x + dst->clip_rect.w - 1;
+	if (x1>right) {
+		return(0);
+	}
+	y2 = y + ry;
+	top = dst->clip_rect.y;
+	if (y2<top) {
+		return(0);
+	}
+	y1 = y - ry;
+	bottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	if (y1>bottom) {
+		return(0);
+	}
+
+	/*
+	* Init vars: "previous offset" trackers start at a value no real
+	* offset takes, so the first iteration always draws
+	*/
+	oh = oi = oj = ok = 0xFFFF;
+
+	/*
+	* Draw
+	*/
+	result = 0;
+
+	/* Lock surface once for the whole drawing pass */
+	if (SDL_MUSTLOCK(dst)) {
+		if (SDL_LockSurface(dst) < 0) {
+			return (-1);
+		}
+	}
+
+	/*
+	* Check alpha
+	*/
+	if ((color & 255) == 255) {
+
+		/*
+		* No Alpha - direct memory writes
+		*/
+
+		/*
+		* Setup color: pick the R,G,B,A bytes out of the 0xRRGGBBAA
+		* value according to host byte order and map them to the
+		* surface pixel format
+		*/
+		colorptr = (Uint8 *) & color;
+		if (SDL_BYTEORDER == SDL_BIG_ENDIAN) {
+			color = SDL_MapRGBA(dst->format, colorptr[0], colorptr[1], colorptr[2], colorptr[3]);
+		} else {
+			color = SDL_MapRGBA(dst->format, colorptr[3], colorptr[2], colorptr[1], colorptr[0]);
+		}
+
+
+		if (rx > ry) {
+			/* ix/iy are offsets in 1/64 pixel units; h and i are the rounded pixel offsets */
+			ix = 0;
+			iy = rx * 64;
+
+			do {
+				h = (ix + 32) >> 6;
+				i = (iy + 32) >> 6;
+				/* j and k: h and i scaled by ry/rx for the vertical axis */
+				j = (h * ry) / rx;
+				k = (i * ry) / rx;
+
+				/* NOTE(review): this test is shaped differently from the one in the rx <= ry branch; kept as is */
+				if (((ok != k) && (oj != k)) || ((oj != j) && (ok != j)) || (k != j)) {
+					xph = x + h;
+					xmh = x - h;
+					if (k > 0) {
+						ypk = y + k;
+						ymk = y - k;
+						result |= fastPixelColorNolock(dst, xmh, ypk, color);
+						result |= fastPixelColorNolock(dst, xph, ypk, color);
+						result |= fastPixelColorNolock(dst, xmh, ymk, color);
+						result |= fastPixelColorNolock(dst, xph, ymk, color);
+					} else {
+						result |= fastPixelColorNolock(dst, xmh, y, color);
+						result |= fastPixelColorNolock(dst, xph, y, color);
+					}
+					ok = k;
+					xpi = x + i;
+					xmi = x - i;
+					if (j > 0) {
+						ypj = y + j;
+						ymj = y - j;
+						result |= fastPixelColorNolock(dst, xmi, ypj, color);
+						result |= fastPixelColorNolock(dst, xpi, ypj, color);
+						result |= fastPixelColorNolock(dst, xmi, ymj, color);
+						result |= fastPixelColorNolock(dst, xpi, ymj, color);
+					} else {
+						result |= fastPixelColorNolock(dst, xmi, y, color);
+						result |= fastPixelColorNolock(dst, xpi, y, color);
+					}
+					oj = j;
+				}
+
+				/* Step the fixed-point pair; iy uses the already updated ix */
+				ix = ix + iy / rx;
+				iy = iy - ix / rx;
+
+			} while (i > h);
+		} else {
+			ix = 0;
+			iy = ry * 64;
+
+			do {
+				h = (ix + 32) >> 6;
+				i = (iy + 32) >> 6;
+				/* j and k: h and i scaled by rx/ry for the horizontal axis */
+				j = (h * rx) / ry;
+				k = (i * rx) / ry;
+
+				if (((oi != i) && (oh != i)) || ((oh != h) && (oi != h) && (i != h))) {
+					xmj = x - j;
+					xpj = x + j;
+					if (i > 0) {
+						ypi = y + i;
+						ymi = y - i;
+						result |= fastPixelColorNolock(dst, xmj, ypi, color);
+						result |= fastPixelColorNolock(dst, xpj, ypi, color);
+						result |= fastPixelColorNolock(dst, xmj, ymi, color);
+						result |= fastPixelColorNolock(dst, xpj, ymi, color);
+					} else {
+						result |= fastPixelColorNolock(dst, xmj, y, color);
+						result |= fastPixelColorNolock(dst, xpj, y, color);
+					}
+					oi = i;
+					xmk = x - k;
+					xpk = x + k;
+					if (h > 0) {
+						yph = y + h;
+						ymh = y - h;
+						result |= fastPixelColorNolock(dst, xmk, yph, color);
+						result |= fastPixelColorNolock(dst, xpk, yph, color);
+						result |= fastPixelColorNolock(dst, xmk, ymh, color);
+						result |= fastPixelColorNolock(dst, xpk, ymh, color);
+					} else {
+						result |= fastPixelColorNolock(dst, xmk, y, color);
+						result |= fastPixelColorNolock(dst, xpk, y, color);
+					}
+					oh = h;
+				}
+
+				ix = ix + iy / ry;
+				iy = iy - ix / ry;
+
+			} while (i > h);
+		}
+
+	} else {
+
+		/*
+		* Using Alpha - blended pixel writes; same stepping as above
+		*/
+
+		if (rx > ry) {
+			ix = 0;
+			iy = rx * 64;
+
+			do {
+				h = (ix + 32) >> 6;
+				i = (iy + 32) >> 6;
+				j = (h * ry) / rx;
+				k = (i * ry) / rx;
+
+				if (((ok != k) && (oj != k)) || ((oj != j) && (ok != j)) || (k != j)) {
+					xph = x + h;
+					xmh = x - h;
+					if (k > 0) {
+						ypk = y + k;
+						ymk = y - k;
+						result |= pixelColorNolock (dst, xmh, ypk, color);
+						result |= pixelColorNolock (dst, xph, ypk, color);
+						result |= pixelColorNolock (dst, xmh, ymk, color);
+						result |= pixelColorNolock (dst, xph, ymk, color);
+					} else {
+						result |= pixelColorNolock (dst, xmh, y, color);
+						result |= pixelColorNolock (dst, xph, y, color);
+					}
+					ok = k;
+					xpi = x + i;
+					xmi = x - i;
+					if (j > 0) {
+						ypj = y + j;
+						ymj = y - j;
+						result |= pixelColorNolock (dst, xmi, ypj, color);
+						result |= pixelColorNolock (dst, xpi, ypj, color);
+						result |= pixelColorNolock (dst, xmi, ymj, color);
+						/* Surface is already locked here: use the Nolock variant like the sibling calls */
+						result |= pixelColorNolock (dst, xpi, ymj, color);
+					} else {
+						result |= pixelColorNolock (dst, xmi, y, color);
+						result |= pixelColorNolock (dst, xpi, y, color);
+					}
+					oj = j;
+				}
+
+				ix = ix + iy / rx;
+				iy = iy - ix / rx;
+
+			} while (i > h);
+		} else {
+			ix = 0;
+			iy = ry * 64;
+
+			do {
+				h = (ix + 32) >> 6;
+				i = (iy + 32) >> 6;
+				j = (h * rx) / ry;
+				k = (i * rx) / ry;
+
+				if (((oi != i) && (oh != i)) || ((oh != h) && (oi != h) && (i != h))) {
+					xmj = x - j;
+					xpj = x + j;
+					if (i > 0) {
+						ypi = y + i;
+						ymi = y - i;
+						result |= pixelColorNolock (dst, xmj, ypi, color);
+						result |= pixelColorNolock (dst, xpj, ypi, color);
+						result |= pixelColorNolock (dst, xmj, ymi, color);
+						result |= pixelColorNolock (dst, xpj, ymi, color);
+					} else {
+						result |= pixelColorNolock (dst, xmj, y, color);
+						result |= pixelColorNolock (dst, xpj, y, color);
+					}
+					oi = i;
+					xmk = x - k;
+					xpk = x + k;
+					if (h > 0) {
+						yph = y + h;
+						ymh = y - h;
+						result |= pixelColorNolock (dst, xmk, yph, color);
+						result |= pixelColorNolock (dst, xpk, yph, color);
+						result |= pixelColorNolock (dst, xmk, ymh, color);
+						result |= pixelColorNolock (dst, xpk, ymh, color);
+					} else {
+						result |= pixelColorNolock (dst, xmk, y, color);
+						result |= pixelColorNolock (dst, xpk, y, color);
+					}
+					oh = h;
+				}
+
+				ix = ix + iy / ry;
+				iy = iy - ix / ry;
+
+			} while (i > h);
+		}
+
+	}				/* Alpha check */
+
+	/* Unlock surface */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw ellipse with blending, color given as separate channels.
+
+Packs the channels into 0xRRGGBBAA and forwards to ellipseColor().
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the ellipse.
+\param y Y coordinate of the center of the ellipse.
+\param rx Horizontal radius in pixels of the ellipse.
+\param ry Vertical radius in pixels of the ellipse.
+\param r The red value of the ellipse to draw.
+\param g The green value of the ellipse to draw.
+\param b The blue value of the ellipse to draw.
+\param a The alpha value of the ellipse to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int ellipseRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed;
+
+	/* Assemble 0xRRGGBBAA, most significant channel first */
+	packed = (Uint32) r << 24;
+	packed |= (Uint32) g << 16;
+	packed |= (Uint32) b << 8;
+	packed |= (Uint32) a;
+
+	return ellipseColor(dst, x, y, rx, ry, packed);
+}
+
+/* ----- AA Ellipse */
+
+/*
+* Windows targets do not have lrint, so provide a local inline version.
+* Only compiled when _MSC_VER is defined; exactly one of the variants below
+* is selected by target architecture, and any other architecture is a
+* compile-time error.
+*/
+#if defined(_MSC_VER)
+/* Detect 64bit and use intrinsic version */
+#ifdef _M_X64
+#include <emmintrin.h>
+/*
+* x64 variant: converts through the SSE scalar intrinsics.
+* NOTE(review): this variant takes a float, whereas the x86 and ARM variants
+* take a double - confirm the narrowing is acceptable for the callers.
+*/
+static __inline long 
+	lrint(float f) 
+{
+	return _mm_cvtss_si32(_mm_load_ss(&f));
+}
+#elif defined(_M_IX86)
+/*
+* 32-bit x86 variant: loads the value onto the FPU stack and stores it back
+* as an integer with inline assembly.
+*/
+__inline long int
+	lrint (double flt)
+{	
+	int intgr;
+	_asm
+	{
+		fld flt
+			fistp intgr
+	};
+	return intgr;
+}
+#elif defined(_M_ARM)
+#include <armintr.h>
+/*
+* ARM variant: a naked function whose body is four emitted instruction words.
+* Warning 4716 is disabled around it because the function has no C-level
+* return statement (presumably that is what 4716 reports - confirm).
+*/
+#pragma warning(push)
+#pragma warning(disable: 4716)
+__declspec(naked) long int
+	lrint (double flt)
+{
+	__emit(0xEC410B10); // fmdrr  d0, r0, r1
+	__emit(0xEEBD0B40); // ftosid s0, d0
+	__emit(0xEE100A10); // fmrs   r0, s0
+	__emit(0xE12FFF1E); // bx     lr
+}
+#pragma warning(pop)
+#else
+#error lrint needed for MSVC on non X86/AMD64/ARM targets.
+#endif
+#endif
+
+/*!
+\brief Draw anti-aliased ellipse with blending.
+
+Note: Based on code from Anders Lindstroem, which is based on code from sge library, 
+which is based on code from TwinLib.
+
+The outline is traced in two passes that both start from the current point:
+the first pass steps x one pixel at a time for dxt steps, the second steps y
+one pixel at a time for dyt steps. Every step plots two neighbouring pixels
+with complementary weights and mirrors them about the center into all four
+quadrants.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the aa-ellipse.
+\param y Y coordinate of the center of the aa-ellipse.
+\param rx Horizontal radius in pixels of the aa-ellipse.
+\param ry Vertical radius in pixels of the aa-ellipse.
+\param color The color value of the aa-ellipse to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aaellipseColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color)
+{
+	Sint16 left, right, top, bottom;
+	Sint16 x1,y1,x2,y2;
+	int i;
+	int a2, b2, ds, dt, dxt, t, s, d;
+	Sint16 xp, yp, xs, ys, dyt, od, xx, yy, xc2, yc2;
+	float cp;
+	double sab;
+	Uint8 weight, iweight;
+	int result;
+
+	/*
+	* Check visibility of clipping rectangle
+	*/
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return(0);
+	}
+
+	/*
+	* Sanity check radii 
+	*/
+	if ((rx < 0) || (ry < 0)) {
+		return (-1);
+	}
+
+	/*
+	* Special case for rx=0 - draw a vline 
+	*/
+	if (rx == 0) {
+		return (vlineColor(dst, x, y - ry, y + ry, color));
+	}
+	/*
+	* Special case for ry=0 - draw an hline 
+	*/
+	if (ry == 0) {
+		return (hlineColor(dst, x - rx, x + rx, y, color));
+	}
+
+	/*
+	* Get ellipse and clipping boundary and 
+	* test if bounding box of ellipse is visible 
+	*/
+	x2 = x + rx;
+	left = dst->clip_rect.x;
+	if (x2<left) {
+		return(0);
+	} 
+	x1 = x - rx;
+	right = dst->clip_rect.x + dst->clip_rect.w - 1;
+	if (x1>right) {
+		return(0);
+	} 
+	y2 = y + ry;
+	top = dst->clip_rect.y;
+	if (y2<top) {
+		return(0);
+	} 
+	y1 = y - ry;
+	bottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	if (y1>bottom) {
+		return(0);
+	} 
+
+	/* Variable setup: squared radii and their doubles (per-step increments for s and t) */
+	a2 = rx * rx;
+	b2 = ry * ry;
+
+	ds = 2 * a2;
+	dt = 2 * b2;
+
+	/* Twice the center, so that (xc2 - px, yc2 - py) mirrors a point about the center */
+	xc2 = 2 * x;
+	yc2 = 2 * y;
+
+	/* dxt is the number of steps taken by the first (x-stepping) loop */
+	sab = sqrt((double)(a2 + b2));
+	od = (Sint16)lrint(sab*0.01) + 1; /* introduce some overdraw */
+	dxt = (Sint16)lrint((double)a2 / sab) + od;
+
+	/* t, s and d are the running terms of the stepping; d is the decision value */
+	t = 0;
+	s = -2 * a2 * ry;
+	d = 0;
+
+	/* Start at the point directly at y - ry on the vertical axis */
+	xp = x;
+	yp = y - ry;
+
+	/* Lock surface */
+	if (SDL_MUSTLOCK(dst)) {
+		if (SDL_LockSurface(dst) < 0) {
+			return (-1);
+		}
+	}
+
+	/* Draw */
+	result = 0;
+
+	/* "End points": the start point and its mirror images, at full color */
+	result |= pixelColorNolock(dst, xp, yp, color);
+	result |= pixelColorNolock(dst, xc2 - xp, yp, color);
+	result |= pixelColorNolock(dst, xp, yc2 - yp, color);
+	result |= pixelColorNolock(dst, xc2 - xp, yc2 - yp, color);
+
+	/* First pass: x moves by one each step; ys is the row of the second, blended pixel */
+	for (i = 1; i <= dxt; i++) {
+		xp--;
+		d += t - b2;
+
+		if (d >= 0)
+			ys = yp - 1;
+		else if ((d - s - a2) > 0) {
+			if ((2 * d - s - a2) >= 0)
+				ys = yp + 1;
+			else {
+				ys = yp;
+				yp++;
+				d -= s + a2;
+				s += ds;
+			}
+		} else {
+			yp++;
+			ys = yp + 1;
+			d -= s + a2;
+			s += ds;
+		}
+
+		t -= dt;
+
+		/* Calculate alpha: |d| / |s| clamped to 1.0; a zero s is treated as 1.0 */
+		if (s != 0) {
+			cp = (float) abs(d) / (float) abs(s);
+			if (cp > 1.0) {
+				cp = 1.0;
+			}
+		} else {
+			cp = 1.0;
+		}
+
+		/* Calculate weights: weight goes to row ys, its complement to row yp */
+		weight = (Uint8) (cp * 255);
+		iweight = 255 - weight;
+
+		/* Upper half */
+		xx = xc2 - xp;
+		result |= pixelColorWeightNolock(dst, xp, yp, color, iweight);
+		result |= pixelColorWeightNolock(dst, xx, yp, color, iweight);
+
+		result |= pixelColorWeightNolock(dst, xp, ys, color, weight);
+		result |= pixelColorWeightNolock(dst, xx, ys, color, weight);
+
+		/* Lower half */
+		yy = yc2 - yp;
+		result |= pixelColorWeightNolock(dst, xp, yy, color, iweight);
+		result |= pixelColorWeightNolock(dst, xx, yy, color, iweight);
+
+		yy = yc2 - ys;
+		result |= pixelColorWeightNolock(dst, xp, yy, color, weight);
+		result |= pixelColorWeightNolock(dst, xx, yy, color, weight);
+	}
+
+	/* Replaces original approximation code dyt = abs(yp - yc); */
+	dyt = (Sint16)lrint((double)b2 / sab ) + od;    
+
+	/* Second pass: y moves by one each step; xs is the column of the second, blended pixel */
+	for (i = 1; i <= dyt; i++) {
+		yp++;
+		d -= s + a2;
+
+		if (d <= 0)
+			xs = xp + 1;
+		else if ((d + t - b2) < 0) {
+			if ((2 * d + t - b2) <= 0)
+				xs = xp - 1;
+			else {
+				xs = xp;
+				xp--;
+				d += t - b2;
+				t -= dt;
+			}
+		} else {
+			xp--;
+			xs = xp - 1;
+			d += t - b2;
+			t -= dt;
+		}
+
+		s += ds;
+
+		/* Calculate alpha: |d| / |t| clamped to 1.0; a zero t is treated as 1.0 */
+		if (t != 0) {
+			cp = (float) abs(d) / (float) abs(t);
+			if (cp > 1.0) {
+				cp = 1.0;
+			}
+		} else {
+			cp = 1.0;
+		}
+
+		/* Calculate weight: weight goes to column xs, its complement to column xp */
+		weight = (Uint8) (cp * 255);
+		iweight = 255 - weight;
+
+		/* Left half */
+		xx = xc2 - xp;
+		yy = yc2 - yp;
+		result |= pixelColorWeightNolock(dst, xp, yp, color, iweight);
+		result |= pixelColorWeightNolock(dst, xx, yp, color, iweight);
+
+		result |= pixelColorWeightNolock(dst, xp, yy, color, iweight);
+		result |= pixelColorWeightNolock(dst, xx, yy, color, iweight);
+
+		/* Right half */
+		xx = xc2 - xs;
+		result |= pixelColorWeightNolock(dst, xs, yp, color, weight);
+		result |= pixelColorWeightNolock(dst, xx, yp, color, weight);
+
+		result |= pixelColorWeightNolock(dst, xs, yy, color, weight);
+		result |= pixelColorWeightNolock(dst, xx, yy, color, weight);
+
+	}
+
+	/* Unlock surface */
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw anti-aliased ellipse with blending, color given as separate channels.
+
+Packs the channels into 0xRRGGBBAA and forwards to aaellipseColor().
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the aa-ellipse.
+\param y Y coordinate of the center of the aa-ellipse.
+\param rx Horizontal radius in pixels of the aa-ellipse.
+\param ry Vertical radius in pixels of the aa-ellipse.
+\param r The red value of the aa-ellipse to draw.
+\param g The green value of the aa-ellipse to draw.
+\param b The blue value of the aa-ellipse to draw.
+\param a The alpha value of the aa-ellipse to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aaellipseRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed;
+
+	/* Assemble 0xRRGGBBAA, most significant channel first */
+	packed = (Uint32) r << 24;
+	packed |= (Uint32) g << 16;
+	packed |= (Uint32) b << 8;
+	packed |= (Uint32) a;
+
+	return aaellipseColor(dst, x, y, rx, ry, packed);
+}
+
+/* ---- Filled Ellipse */
+
+/* Note: */
+/* Based on algorithm from sge library with multiple-hline draw removal */
+/* and other speedup changes. */
+
+/*
+* Internal helper for filledEllipseColor(): fill the span xa..xb on the two
+* rows y + dy and y - dy, or on row y alone when dy is not positive.
+* Returns the OR of the hlineColor() results.
+*/
+static int filledEllipseRowPair(SDL_Surface * dst, int xa, int xb, Sint16 y, int dy, Uint32 color)
+{
+	int status;
+
+	if (dy > 0) {
+		status = hlineColor(dst, xa, xb, y + dy, color);
+		status |= hlineColor(dst, xa, xb, y - dy, color);
+	} else {
+		status = hlineColor(dst, xa, xb, y, color);
+	}
+	return status;
+}
+
+/*!
+\brief Draw filled ellipse with blending.
+
+Note: Based on algorithm from sge library with multiple-hline draw removal
+and other speedup changes.
+
+The ellipse is filled with horizontal spans drawn through hlineColor(). Row
+offsets already covered on a previous step are remembered so that the same
+row is not filled again.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the filled ellipse.
+\param y Y coordinate of the center of the filled ellipse.
+\param rx Horizontal radius in pixels of the filled ellipse.
+\param ry Vertical radius in pixels of the filled ellipse.
+\param color The color value of the filled ellipse to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledEllipseColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint32 color)
+{
+	Sint16 clipL, clipR, clipT, clipB;
+	Sint16 boxL, boxT, boxR, boxB;
+	int fx, fy;				/* offsets in 1/64 pixel units */
+	int grow, shrink;			/* fx, fy rounded to whole pixels */
+	int growScaled, shrinkScaled;		/* the same, scaled to the shorter axis */
+	int prevGrow, prevShrink, prevGrowScaled, prevShrinkScaled;
+	int status = 0;
+
+	/* Nothing is visible through an empty clipping rectangle */
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/* Negative radii are an error */
+	if ((rx < 0) || (ry < 0)) {
+		return -1;
+	}
+
+	/* Degenerate shapes: zero width is a vline, zero height is a hline */
+	if (rx == 0) {
+		return vlineColor(dst, x, y - ry, y + ry, color);
+	}
+	if (ry == 0) {
+		return hlineColor(dst, x - rx, x + rx, y, color);
+	}
+
+	/* Bounding box of the ellipse and edges of the clipping rectangle */
+	boxL = x - rx;
+	boxR = x + rx;
+	boxT = y - ry;
+	boxB = y + ry;
+	clipL = dst->clip_rect.x;
+	clipR = dst->clip_rect.x + dst->clip_rect.w - 1;
+	clipT = dst->clip_rect.y;
+	clipB = dst->clip_rect.y + dst->clip_rect.h - 1;
+
+	/* Reject when the bounding box lies completely outside the clip area */
+	if ((boxR < clipL) || (boxL > clipR) || (boxB < clipT) || (boxT > clipB)) {
+		return 0;
+	}
+
+	/* No row has been drawn yet: use a value no real offset takes */
+	prevGrow = 0xFFFF;
+	prevShrink = 0xFFFF;
+	prevGrowScaled = 0xFFFF;
+	prevShrinkScaled = 0xFFFF;
+
+	if (rx > ry) {
+		/* Wider than tall: step along x, rows are the offsets scaled by ry/rx */
+		fx = 0;
+		fy = rx * 64;
+
+		do {
+			grow = (fx + 32) >> 6;
+			shrink = (fy + 32) >> 6;
+			growScaled = (grow * ry) / rx;
+			shrinkScaled = (shrink * ry) / rx;
+
+			/* Short span on the far rows, if that row was not drawn before */
+			if ((prevShrinkScaled != shrinkScaled) && (prevGrowScaled != shrinkScaled)) {
+				status |= filledEllipseRowPair(dst, x - grow, x + grow, y, shrinkScaled, color);
+				prevShrinkScaled = shrinkScaled;
+			}
+
+			/* Long span on the near rows, unless it coincides with a row already drawn */
+			if ((prevGrowScaled != growScaled) && (prevShrinkScaled != growScaled) && (shrinkScaled != growScaled)) {
+				status |= filledEllipseRowPair(dst, x - shrink, x + shrink, y, growScaled, color);
+				prevGrowScaled = growScaled;
+			}
+
+			/* Step the fixed-point pair; fy uses the already updated fx */
+			fx += fy / rx;
+			fy -= fx / rx;
+
+		} while (shrink > grow);
+	} else {
+		/* Taller than wide (or equal): step along y, spans are the offsets scaled by rx/ry */
+		fx = 0;
+		fy = ry * 64;
+
+		do {
+			grow = (fx + 32) >> 6;
+			shrink = (fy + 32) >> 6;
+			growScaled = (grow * rx) / ry;
+			shrinkScaled = (shrink * rx) / ry;
+
+			/* Short span on the far rows, if that row was not drawn before */
+			if ((prevShrink != shrink) && (prevGrow != shrink)) {
+				status |= filledEllipseRowPair(dst, x - growScaled, x + growScaled, y, shrink, color);
+				prevShrink = shrink;
+			}
+
+			/* Long span on the near rows, unless it coincides with a row already drawn */
+			if ((prevGrow != grow) && (prevShrink != grow) && (shrink != grow)) {
+				status |= filledEllipseRowPair(dst, x - shrinkScaled, x + shrinkScaled, y, grow, color);
+				prevGrow = grow;
+			}
+
+			fx += fy / ry;
+			fy -= fx / ry;
+
+		} while (shrink > grow);
+	}
+
+	return status;
+}
+
+/*!
+\brief Draw filled ellipse with blending, color given as separate channels.
+
+Packs the channels into 0xRRGGBBAA and forwards to filledEllipseColor().
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the filled ellipse.
+\param y Y coordinate of the center of the filled ellipse.
+\param rx Horizontal radius in pixels of the filled ellipse.
+\param ry Vertical radius in pixels of the filled ellipse.
+\param r The red value of the filled ellipse to draw.
+\param g The green value of the filled ellipse to draw.
+\param b The blue value of the filled ellipse to draw.
+\param a The alpha value of the filled ellipse to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledEllipseRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rx, Sint16 ry, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed;
+
+	/* Assemble 0xRRGGBBAA, most significant channel first */
+	packed = (Uint32) r << 24;
+	packed |= (Uint32) g << 16;
+	packed |= (Uint32) b << 8;
+	packed |= (Uint32) a;
+
+	return filledEllipseColor(dst, x, y, rx, ry, packed);
+}
+
+/* ----- pie */
+
+/*!
+\brief Internal float (low-speed) pie-calc implementation by drawing polygons.
+
+Note: Builds a vertex array (center, then points along the arc) and renders it
+with polygonColor() or filledPolygonColor(). When start and end reduce to the
+same angle only the center and one arc point exist, and a single line is drawn
+instead.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the pie.
+\param y Y coordinate of the center of the pie.
+\param rad Radius in pixels of the pie.
+\param start Starting angle in degrees of the pie.
+\param end Ending angle in degrees of the pie.
+\param color The color value of the pie to draw (0xRRGGBBAA).
+\param filled Flag indicating if the pie should be filled (=1) or not (=0).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int _pieColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color, Uint8 filled)
+{
+	Sint16 clipL, clipR, clipT, clipB;
+	Sint16 boxL, boxT, boxR, boxB;
+	int status;
+	double theta, thetaFirst, thetaLast;
+	double thetaStep;
+	double radius;
+	int count, n;
+	Sint16 *xs, *ys;
+
+	/* Nothing is visible through an empty clipping rectangle */
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/* A negative radius is an error */
+	if (rad < 0) {
+		return -1;
+	}
+
+	/* Reduce both angles modulo a full turn */
+	start %= 360;
+	end %= 360;
+
+	/* A zero radius degenerates to a single pixel */
+	if (rad == 0) {
+		return pixelColor(dst, x, y, color);
+	}
+
+	/*
+	* Clip against the full circle rather than the pie itself (not 100%
+	* optimal): reject when the circle's bounding box is outside the clip area
+	*/
+	boxL = x - rad;
+	boxR = x + rad;
+	boxT = y - rad;
+	boxB = y + rad;
+	clipL = dst->clip_rect.x;
+	clipR = dst->clip_rect.x + dst->clip_rect.w - 1;
+	clipT = dst->clip_rect.y;
+	clipB = dst->clip_rect.y + dst->clip_rect.h - 1;
+	if ((boxR < clipL) || (boxL > clipR) || (boxB < clipT) || (boxT > clipB)) {
+		return 0;
+	}
+
+	/* Angles in radians; the step between arc vertices shrinks as the radius grows */
+	radius = (double) rad;
+	thetaStep = 3.0 / radius;
+	thetaFirst = (double) start *(2.0 * M_PI / 360.0);
+	thetaLast = (double) end *(2.0 * M_PI / 360.0);
+	if (start > end) {
+		/* Sweep wraps around: push the end one full turn ahead */
+		thetaLast += (2.0 * M_PI);
+	}
+
+	/* Center and first arc vertex always exist; count the rest by stepping */
+	count = 2;
+	for (theta = thetaFirst; theta < thetaLast; theta += thetaStep) {
+		count++;
+	}
+
+	/* One allocation holds both arrays: x values first, y values after them */
+	xs = (Sint16 *) malloc(2 * sizeof(Uint16) * count);
+	if (xs == NULL) {
+		return -1;
+	}
+	ys = xs + count;
+
+	/* Vertex 0 is the center, vertex 1 the point at the start angle */
+	xs[0] = x;
+	ys[0] = y;
+	theta = thetaFirst;
+	xs[1] = x + (int) (radius * cos(theta));
+	ys[1] = y + (int) (radius * sin(theta));
+
+	if (count >= 3) {
+		/* Remaining arc vertices; the last step is clamped onto the end angle */
+		for (n = 2, theta = thetaFirst; theta < thetaLast; n++) {
+			theta += thetaStep;
+			if (theta > thetaLast) {
+				theta = thetaLast;
+			}
+			xs[n] = x + (int) (radius * cos(theta));
+			ys[n] = y + (int) (radius * sin(theta));
+		}
+
+		/* Render outline or filled shape */
+		if (filled) {
+			status = filledPolygonColor(dst, xs, ys, count, color);
+		} else {
+			status = polygonColor(dst, xs, ys, count, color);
+		}
+	} else {
+		/* Only center and one arc point: draw the connecting line */
+		status = lineColor(dst, xs[0], ys[0], xs[1], ys[1], color);
+	}
+
+	/* Releases both arrays, since ys points into the same allocation */
+	free(xs);
+
+	return status;
+}
+
+/*!
+\brief Draw pie (outline) with alpha blending.
+
+Forwards to _pieColor() with the filled flag cleared.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the pie.
+\param y Y coordinate of the center of the pie.
+\param rad Radius in pixels of the pie.
+\param start Starting angle in degrees of the pie.
+\param end Ending angle in degrees of the pie.
+\param color The color value of the pie to draw (0xRRGGBBAA).
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int pieColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, 
+	Sint16 start, Sint16 end, Uint32 color) 
+{
+	const Uint8 outlineOnly = 0;
+
+	return _pieColor(dst, x, y, rad, start, end, color, outlineOnly);
+}
+
+/*!
+\brief Draw pie (outline) with alpha blending.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the pie.
+\param y Y coordinate of the center of the pie.
+\param rad Radius in pixels of the pie.
+\param start Starting radius in degrees of the pie.
+\param end Ending radius in degrees of the pie.
+\param r The red value of the pie to draw. 
+\param g The green value of the pie to draw. 
+\param b The blue value of the pie to draw. 
+\param a The alpha value of the pie to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int pieRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad,
+	Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Pack the four channels into one 0xRRGGBBAA value before delegating. */
+	const Uint32 rgba = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+	return _pieColor(dst, x, y, rad, start, end, rgba, 0);
+}
+
+/*!
+\brief Draw filled pie with alpha blending.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the filled pie.
+\param y Y coordinate of the center of the filled pie.
+\param rad Radius in pixels of the filled pie.
+\param start Starting angle in degrees of the filled pie.
+\param end Ending angle in degrees of the filled pie.
+\param color The color value of the filled pie to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledPieColor(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad, Sint16 start, Sint16 end, Uint32 color)
+{
+	return _pieColor(dst, x, y, rad, start, end, color, 1);	/* final argument 1: the filled variant */
+}
+
+/*!
+\brief Draw filled pie with alpha blending.
+
+\param dst The surface to draw on.
+\param x X coordinate of the center of the filled pie.
+\param y Y coordinate of the center of the filled pie.
+\param rad Radius in pixels of the filled pie.
+\param start Starting angle in degrees of the filled pie.
+\param end Ending angle in degrees of the filled pie.
+\param r The red value of the filled pie to draw. 
+\param g The green value of the filled pie to draw. 
+\param b The blue value of the filled pie to draw. 
+\param a The alpha value of the filled pie to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledPieRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, Sint16 rad,
+	Sint16 start, Sint16 end, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	const Uint32 rgba = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+	return _pieColor(dst, x, y, rad, start, end, rgba, 1);	/* packed 0xRRGGBBAA, final argument 1 */
+}
+
+/* ------ Trigon */
+
+/*!
+\brief Draw trigon (triangle outline) with alpha blending.
+
+Note: Creates vertex array and uses polygon routine to render.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the trigon.
+\param y1 Y coordinate of the first point of the trigon.
+\param x2 X coordinate of the second point of the trigon.
+\param y2 Y coordinate of the second point of the trigon.
+\param x3 X coordinate of the third point of the trigon.
+\param y3 Y coordinate of the third point of the trigon.
+\param color The color value of the trigon to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int trigonColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color)
+{
+	/*
+	* Gather the three corners into the parallel coordinate arrays
+	* that the polygon routine takes.
+	*/
+	const Sint16 xs[3] = { x1, x2, x3 };
+	const Sint16 ys[3] = { y1, y2, y3 };
+
+	/*
+	* Draw the outline as a 3-point polygon and pass its status back.
+	*/
+	return polygonColor(dst, xs, ys, 3, color);
+}
+
+/*!
+\brief Draw trigon (triangle outline) with alpha blending.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the trigon.
+\param y1 Y coordinate of the first point of the trigon.
+\param x2 X coordinate of the second point of the trigon.
+\param y2 Y coordinate of the second point of the trigon.
+\param x3 X coordinate of the third point of the trigon.
+\param y3 Y coordinate of the third point of the trigon.
+\param r The red value of the trigon to draw. 
+\param g The green value of the trigon to draw. 
+\param b The blue value of the trigon to draw. 
+\param a The alpha value of the trigon to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int trigonRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+	Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/*
+	* The corners go into two parallel arrays, X coordinates in one
+	* and Y coordinates in the other, in the order given.
+	*/
+	const Sint16 xs[3] = { x1, x2, x3 };
+	const Sint16 ys[3] = { y1, y2, y3 };
+
+	/*
+	* The RGBA polygon routine does the drawing; its status is returned.
+	*/
+	return polygonRGBA(dst, xs, ys, 3, r, g, b, a);
+}
+
+/* ------ AA-Trigon */
+
+/*!
+\brief Draw anti-aliased trigon (triangle outline) with alpha blending.
+
+Note: Creates vertex array and uses aapolygon routine to render.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the aa-trigon.
+\param y1 Y coordinate of the first point of the aa-trigon.
+\param x2 X coordinate of the second point of the aa-trigon.
+\param y2 Y coordinate of the second point of the aa-trigon.
+\param x3 X coordinate of the third point of the aa-trigon.
+\param y3 Y coordinate of the third point of the aa-trigon.
+\param color The color value of the aa-trigon to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aatrigonColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color)
+{
+	/*
+	* Build the 3-entry vertex arrays (X and Y separately) from the
+	* individual corner arguments.
+	*/
+	const Sint16 xs[3] = { x1, x2, x3 };
+	const Sint16 ys[3] = { y1, y2, y3 };
+
+	/*
+	* Delegate to the anti-aliased polygon routine and return its status.
+	*/
+	return aapolygonColor(dst, xs, ys, 3, color);
+}
+
+/*!
+\brief Draw anti-aliased trigon (triangle outline) with alpha blending.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the aa-trigon.
+\param y1 Y coordinate of the first point of the aa-trigon.
+\param x2 X coordinate of the second point of the aa-trigon.
+\param y2 Y coordinate of the second point of the aa-trigon.
+\param x3 X coordinate of the third point of the aa-trigon.
+\param y3 Y coordinate of the third point of the aa-trigon.
+\param r The red value of the aa-trigon to draw. 
+\param g The green value of the aa-trigon to draw. 
+\param b The blue value of the aa-trigon to draw. 
+\param a The alpha value of the aa-trigon to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aatrigonRGBA(SDL_Surface * dst,  Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+	Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/*
+	* Corner coordinates, split into an X array and a Y array as the
+	* polygon routines expect.
+	*/
+	const Sint16 xs[3] = { x1, x2, x3 };
+	const Sint16 ys[3] = { y1, y2, y3 };
+
+	/*
+	* Anti-aliased RGBA polygon with three points; status is passed back.
+	*/
+	return aapolygonRGBA(dst, xs, ys, 3, r, g, b, a);
+}
+
+/* ------ Filled Trigon */
+
+/*!
+\brief Draw filled trigon (triangle) with alpha blending.
+
+Note: Creates vertex array and uses filledPolygon routine to render.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the filled trigon.
+\param y1 Y coordinate of the first point of the filled trigon.
+\param x2 X coordinate of the second point of the filled trigon.
+\param y2 Y coordinate of the second point of the filled trigon.
+\param x3 X coordinate of the third point of the filled trigon.
+\param y3 Y coordinate of the third point of the filled trigon.
+\param color The color value of the filled trigon to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledTrigonColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3, Uint32 color)
+{
+	/*
+	* Collect the corners into parallel X / Y arrays for the filled
+	* polygon routine.
+	*/
+	const Sint16 xs[3] = { x1, x2, x3 };
+	const Sint16 ys[3] = { y1, y2, y3 };
+
+	/*
+	* Fill the triangle as a 3-point polygon and return that status.
+	*/
+	return filledPolygonColor(dst, xs, ys, 3, color);
+}
+
+/*!
+\brief Draw filled trigon (triangle) with alpha blending.
+
+Note: Creates vertex array and uses filledPolygon routine to render.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the filled trigon.
+\param y1 Y coordinate of the first point of the filled trigon.
+\param x2 X coordinate of the second point of the filled trigon.
+\param y2 Y coordinate of the second point of the filled trigon.
+\param x3 X coordinate of the third point of the filled trigon.
+\param y3 Y coordinate of the third point of the filled trigon.
+\param r The red value of the filled trigon to draw. 
+\param g The green value of the filled trigon to draw. 
+\param b The blue value of the filled trigon to draw. 
+\param a The alpha value of the filled trigon to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledTrigonRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Sint16 x3, Sint16 y3,
+	Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/*
+	* Vertex arrays for the triangle: X coordinates and Y coordinates
+	* are kept in separate 3-entry arrays.
+	*/
+	const Sint16 xs[3] = { x1, x2, x3 };
+	const Sint16 ys[3] = { y1, y2, y3 };
+
+	/*
+	* The filled RGBA polygon routine renders it; its status is returned.
+	*/
+	return filledPolygonRGBA(dst, xs, ys, 3, r, g, b, a);
+}
+
+/* ---- Polygon */
+
+/*!
+\brief Draw polygon with alpha blending.
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the polygon.
+\param vy Vertex array containing Y coordinates of the points of the polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param color The color value of the polygon to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int polygonColor(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color)
+{
+	int status = 0;	/* OR of the status codes of all edges drawn so far */
+	int prev;	/* index of the vertex an edge starts at */
+	int cur;	/* index of the vertex an edge ends at */
+
+	/*
+	* An empty clipping rectangle means no pixel could become visible;
+	* that counts as success. This test comes before any argument
+	* validation, so dst is dereferenced unconditionally.
+	*/
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/*
+	* Both coordinate arrays are mandatory.
+	*/
+	if ((vx == NULL) || (vy == NULL)) {
+		return -1;
+	}
+
+	/*
+	* Fewer than three vertices do not form a polygon, so this is
+	* reported as a failure.
+	*/
+	if (n < 3) {
+		return -1;
+	}
+
+	/*
+	* Walk the vertex list, joining every vertex to its predecessor.
+	* Each line's status is OR-ed into the overall result, so drawing
+	* continues even if one edge reports a failure.
+	*/
+	prev = 0;
+	for (cur = 1; cur < n; cur++) {
+		status |= lineColor(dst, vx[prev], vy[prev], vx[cur], vy[cur], color);
+		prev = cur;
+	}
+
+	/*
+	* Close the outline: after the loop prev is the last vertex
+	* (n - 1), which is joined back to the first one.
+	*/
+	status |= lineColor(dst, vx[prev], vy[prev], vx[0], vy[0], color);
+
+	/*
+	* The result is zero only if every edge reported zero; otherwise
+	* it carries the OR of the codes returned by lineColor.
+	*/
+	return status;
+}
+
+/*!
+\brief Draw polygon with alpha blending.
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the polygon.
+\param vy Vertex array containing Y coordinates of the points of the polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param r The red value of the polygon to draw. 
+\param g The green value of the polygon to draw. 
+\param b The blue value of the polygon to draw. 
+\param a The alpha value of the polygon to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int polygonRGBA(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Pack the four channels as 0xRRGGBBAA for the color-based variant. */
+	const Uint32 rgba = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return polygonColor(dst, vx, vy, n, rgba);
+}
+
+/* ---- AA-Polygon */
+
+/*!
+\brief Draw anti-aliased polygon with alpha blending.
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the aa-polygon.
+\param vy Vertex array containing Y coordinates of the points of the aa-polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param color The color value of the aa-polygon to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aapolygonColor(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color)
+{
+	int status = 0;	/* OR of the status codes of all edges drawn so far */
+	int from;	/* index of the vertex an edge starts at */
+	int to;	/* index of the vertex an edge ends at */
+
+	/*
+	* With an empty clipping rectangle there is nothing to show, which
+	* is reported as success. The surface is inspected before the
+	* other arguments are validated.
+	*/
+	if ((dst->clip_rect.w == 0) || (dst->clip_rect.h == 0)) {
+		return 0;
+	}
+
+	/*
+	* The X and the Y array must both be supplied.
+	*/
+	if ((vx == NULL) || (vy == NULL)) {
+		return -1;
+	}
+
+	/*
+	* At least three vertices are required; fewer is reported as a
+	* failure.
+	*/
+	if (n < 3) {
+		return -1;
+	}
+
+	/*
+	* Join each vertex to the one before it with an anti-aliased
+	* line. The trailing 0 is forwarded to _aalineColor for every
+	* edge, and all status codes are OR-ed together.
+	*/
+	from = 0;
+	for (to = 1; to < n; to++) {
+		status |= _aalineColor(dst, vx[from], vy[from], vx[to], vy[to], color, 0);
+		from = to;
+	}
+
+	/*
+	* Closing edge: from is now the last vertex (n - 1); connect it
+	* back to vertex 0 with the same trailing 0.
+	*/
+	status |= _aalineColor(dst, vx[from], vy[from], vx[0], vy[0], color, 0);
+
+	/*
+	* Zero only when every edge returned zero; otherwise the OR of
+	* the codes returned by _aalineColor.
+	*/
+	return status;
+}
+
+/*!
+\brief Draw anti-aliased polygon with alpha blending.
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the aa-polygon.
+\param vy Vertex array containing Y coordinates of the points of the aa-polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param r The red value of the aa-polygon to draw. 
+\param g The green value of the aa-polygon to draw. 
+\param b The blue value of the aa-polygon to draw. 
+\param a The alpha value of the aa-polygon to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int aapolygonRGBA(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Combine the channels into a single 0xRRGGBBAA value, then delegate. */
+	const Uint32 rgba = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return aapolygonColor(dst, vx, vy, n, rgba);
+}
+
+/* ---- Filled Polygon */
+
+/*!
+\brief Internal qsort callback that orders two int values; used in filled polygon drawing.
+
+\param a Pointer to the first int to compare.
+\param b Pointer to the second int to compare.
+
+\returns Returns 0 if *a==*b, -1 if *a<*b or 1 if *a>*b.
+*/
+int _gfxPrimitivesCompareInt(const void *a, const void *b)
+{
+	return (*(const int *) a > *(const int *) b) - (*(const int *) a < *(const int *) b);	/* comparing instead of subtracting: *a - *b can overflow int */
+}
+
+/*!
+\brief Shared scratch array of int used by filledPolygonColorMT and texturedPolygonMT when polyInts or polyAllocated is NULL.
+
+Note: Allocated and grown on demand by those functions. Used for non-multithreaded (default) operation.
+*/
+static int *gfxPrimitivesPolyIntsGlobal = NULL;
+
+/*!
+\brief Number of int elements allocated for the shared scratch array; 0 while no array is allocated.
+
+Note: Used for non-multithreaded (default) operation of filledPolygonMT.
+*/
+static int gfxPrimitivesPolyAllocatedGlobal = 0;
+
+/*!
+\brief Draw filled polygon with alpha blending (multi-threaded capable).
+
+Note: The last two parameters are optional; but are required for multithreaded operation.  
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the filled polygon.
+\param vy Vertex array containing Y coordinates of the points of the filled polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param color The color value of the filled polygon to draw (0xRRGGBBAA). 
+\param polyInts Preallocated, temporary vertex array used for sorting vertices. Required for multithreaded operation; set to NULL otherwise.
+\param polyAllocated Flag indicating if temporary vertex array was allocated. Required for multithreaded operation; set to NULL otherwise.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledPolygonColorMT(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color, int **polyInts, int *polyAllocated)
+{
+	int result;	/* OR of the status codes of all spans drawn */
+	int i;
+	int y, xa, xb;	/* current scanline; left and right pixel of one span */
+	int miny, maxy;	/* vertical extent of the polygon */
+	int x1, y1;	/* endpoint of the current edge with the smaller y */
+	int x2, y2;	/* endpoint of the current edge with the larger y */
+	int ind1, ind2;	/* vertex indices of the current edge */
+	int ints;	/* number of intersections stored for the current scanline */
+	int *gfxPrimitivesPolyInts = NULL;
+	int *gfxPrimitivesPolyIntsNew = NULL;
+	int gfxPrimitivesPolyAllocated = 0;
+
+	/*
+	* Check visibility of clipping rectangle (an empty one is reported as success)
+	*/
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return(0);
+	}
+
+	/*
+	* Vertex array NULL check
+	*/
+	if (vx == NULL) {
+		return (-1);
+	}
+	if (vy == NULL) {
+		return (-1);
+	}
+
+	/*
+	* Sanity check number of edges
+	*/
+	if (n < 3) {
+		return -1;
+	}
+
+	/*
+	* Map polygon cache: the file-level one unless the caller passed both cache arguments
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) {
+		/* Use global cache */
+		gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsGlobal;
+		gfxPrimitivesPolyAllocated = gfxPrimitivesPolyAllocatedGlobal;
+	} else {
+		/* Use local cache */
+		gfxPrimitivesPolyInts = *polyInts;
+		gfxPrimitivesPolyAllocated = *polyAllocated;
+	}
+
+	/*
+	* Allocate temp array, only grow array (one int per vertex is needed)
+	*/
+	if (!gfxPrimitivesPolyAllocated) {
+		gfxPrimitivesPolyInts = (int *) malloc(sizeof(int) * n);
+		gfxPrimitivesPolyAllocated = n;
+	} else {
+		if (gfxPrimitivesPolyAllocated < n) {
+			gfxPrimitivesPolyIntsNew = (int *) realloc(gfxPrimitivesPolyInts, sizeof(int) * n);
+			if (!gfxPrimitivesPolyIntsNew) {
+				/* On failure realloc leaves the old, too small block allocated: release it */
+				free(gfxPrimitivesPolyInts);
+				gfxPrimitivesPolyInts = NULL;
+				/* The pointer is NULL now, so the checks below return -1 instead of using it */
+				gfxPrimitivesPolyAllocated = 0;
+			} else {
+				gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsNew;
+				gfxPrimitivesPolyAllocated = n;
+			}
+		}
+	}
+
+	/*
+	* Check temp array (covers a failed malloc, where the size was already set to n)
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		gfxPrimitivesPolyAllocated = 0;
+	}
+
+	/*
+	* Update cache variables, also on failure, so the cache never keeps a stale pointer
+	*/
+	if ((polyInts==NULL) || (polyAllocated==NULL)) { 
+		gfxPrimitivesPolyIntsGlobal =  gfxPrimitivesPolyInts;
+		gfxPrimitivesPolyAllocatedGlobal = gfxPrimitivesPolyAllocated;
+	} else {
+		*polyInts = gfxPrimitivesPolyInts;
+		*polyAllocated = gfxPrimitivesPolyAllocated;
+	}
+
+	/*
+	* Check temp array again: without scratch storage nothing can be drawn
+	*/
+	if (gfxPrimitivesPolyInts==NULL) {        
+		return(-1);
+	}
+
+	/*
+	* Determine Y minimum and maximum over all vertices
+	*/
+	miny = vy[0];
+	maxy = vy[0];
+	for (i = 1; (i < n); i++) {
+		if (vy[i] < miny) {
+			miny = vy[i];
+		} else if (vy[i] > maxy) {
+			maxy = vy[i];
+		}
+	}
+
+	/*
+	* Draw, scanning y: collect edge crossings per scanline, sort them, fill between pairs
+	*/
+	result = 0;
+	for (y = miny; (y <= maxy); y++) {
+		ints = 0;
+		for (i = 0; (i < n); i++) {
+			if (!i) {
+				ind1 = n - 1;
+				ind2 = 0;
+			} else {
+				ind1 = i - 1;
+				ind2 = i;
+			}
+			y1 = vy[ind1];
+			y2 = vy[ind2];
+			if (y1 < y2) {
+				x1 = vx[ind1];
+				x2 = vx[ind2];
+			} else if (y1 > y2) {
+				y2 = vy[ind1];
+				y1 = vy[ind2];
+				x2 = vx[ind1];
+				x1 = vx[ind2];
+			} else {
+				continue;	/* horizontal edges contribute no crossing */
+			}
+			if ( ((y >= y1) && (y < y2)) || ((y == maxy) && (y > y1) && (y <= y2)) ) {
+				gfxPrimitivesPolyInts[ints++] = ((65536 * (y - y1)) / (y2 - y1)) * (x2 - x1) + (65536 * x1);	/* crossing x, scaled by 65536 */
+			} 	    
+		}
+
+		qsort(gfxPrimitivesPolyInts, ints, sizeof(int), _gfxPrimitivesCompareInt);
+
+		for (i = 0; (i < ints); i += 2) {
+			xa = gfxPrimitivesPolyInts[i] + 1;
+			xa = (xa >> 16) + ((xa & 32768) >> 15);	/* scale back to pixels, rounding on the 0.5 bit */
+			xb = gfxPrimitivesPolyInts[i+1] - 1;
+			xb = (xb >> 16) + ((xb & 32768) >> 15);
+			result |= hlineColor(dst, xa, xb, y, color);
+		}
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draw filled polygon with alpha blending (multi-threaded capable).
+
+Note: The last two parameters are optional; but are required for multithreaded operation.  
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the filled polygon.
+\param vy Vertex array containing Y coordinates of the points of the filled polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param r The red value of the filled polygon to draw. 
+\param g The green value of the filled polygon to draw. 
+\param b The blue value of the filled polygon to draw. 
+\param a The alpha value of the filled polygon to draw.
+\param polyInts Preallocated, temporary vertex array used for sorting vertices. Required for multithreaded operation; set to NULL otherwise.
+\param polyAllocated Flag indicating if temporary vertex array was allocated. Required for multithreaded operation; set to NULL otherwise.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledPolygonRGBAMT(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a, int **polyInts, int *polyAllocated)
+{
+	/* Pack the channels as 0xRRGGBBAA; the cache arguments are forwarded untouched. */
+	const Uint32 rgba = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return filledPolygonColorMT(dst, vx, vy, n, rgba, polyInts, polyAllocated);
+}
+
+/*!
+\brief Draw filled polygon with alpha blending.
+
+Note: Standard filledPolygon function is calling multithreaded version with NULL parameters
+to use the global vertex cache.
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the filled polygon.
+\param vy Vertex array containing Y coordinates of the points of the filled polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param color The color value of the filled polygon to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledPolygonColor(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint32 color)
+{
+	/*
+	* No caller-owned scratch storage: both cache arguments are NULL.
+	*/
+	return filledPolygonColorMT(dst, vx, vy, n, color, NULL, NULL);
+}
+
+/*!
+\brief Draw filled polygon with alpha blending.
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the filled polygon.
+\param vy Vertex array containing Y coordinates of the points of the filled polygon.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param r The red value of the filled polygon to draw. 
+\param g The green value of the filled polygon to draw. 
+\param b The blue value of the filled polygon to draw. 
+\param a The alpha value of the filled polygon to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int filledPolygonRGBA(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	/* Channels packed as 0xRRGGBBAA; NULL cache arguments as in filledPolygonColor. */
+	const Uint32 rgba = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+
+	return filledPolygonColorMT(dst, vx, vy, n, rgba, NULL, NULL);
+}
+
+/*!
+\brief Internal function to draw a textured horizontal line.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point (i.e. left) of the line.
+\param x2 X coordinate of the second point (i.e. right) of the line.
+\param y Y coordinate of the points of the line.
+\param texture The texture surface to retrieve color information from.
+\param texture_dx The X offset for the texture lookup.
+\param texture_dy The Y offset for the textured lookup.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int _HLineTextured(SDL_Surface * dst, Sint16 x1, Sint16 x2, Sint16 y, SDL_Surface *texture, int texture_dx, int texture_dy)
+{
+	Sint16 left, right, top, bottom;	/* inclusive bounds of the clipping rectangle */
+	Sint16 w;	/* number of pixels to draw after clipping */
+	Sint16 xtmp;
+	int result = 0;	/* 0 while every blit succeeded, -1 once one failed */
+	int texture_x_walker;	/* texture column sampled for the first pixel */
+	int texture_y_start;	/* texture row sampled for this line */
+	SDL_Rect source_rect,dst_rect;
+	int pixels_written,write_width;
+
+	/*
+	* Check visibility of clipping rectangle
+	*/
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return(0);
+	}
+
+	/*
+	* Swap x1, x2 if required to ensure x1<=x2
+	*/
+	if (x1 > x2) {
+		xtmp = x1;
+		x1 = x2;
+		x2 = xtmp;
+	}
+
+	/*
+	* Get clipping boundary and
+	* check visibility of hline (an invisible line is not an error)
+	*/
+	left = dst->clip_rect.x;
+	if (x2<left) {
+		return(0);
+	}
+	right = dst->clip_rect.x + dst->clip_rect.w - 1;
+	if (x1>right) {
+		return(0);
+	}
+	top = dst->clip_rect.y;
+	bottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	if ((y<top) || (y>bottom)) {
+		return (0);
+	}
+
+	/*
+	* Clip x 
+	*/
+	if (x1 < left) {
+		x1 = left;
+	}
+	if (x2 > right) {
+		x2 = right;
+	}
+
+	/* Calculate width to draw */
+	w = x2 - x1 + 1;
+
+	/* A missing or empty texture cannot be sampled: the code below dereferences it and divides by its size */
+	if ((texture == NULL) || (texture->w <= 0) || (texture->h <= 0)) {
+		return (-1);
+	}
+
+	/* Determine where in the texture we start drawing (wrapped into the texture, also for negative offsets) */
+	texture_x_walker =   (x1 - texture_dx)  % texture->w;
+	if (texture_x_walker < 0){
+		texture_x_walker = texture->w + texture_x_walker ;
+	}
+
+	texture_y_start = (y + texture_dy) % texture->h;
+	if (texture_y_start < 0){
+		texture_y_start = texture->h + texture_y_start;
+	}
+
+	/* setup the source rectangle; we are only drawing one horizontal line */
+	source_rect.y = texture_y_start;
+	source_rect.x = texture_x_walker;
+	source_rect.h = 1;
+
+	/* we will draw to the current y */
+	dst_rect.y = y;
+
+	/* if there are enough pixels left in the current row of the texture draw it all at once */
+	if (w <= texture->w -texture_x_walker){
+		source_rect.w = w;
+		source_rect.x = texture_x_walker;
+		dst_rect.x= x1;
+		result = (SDL_BlitSurface(texture, &source_rect, dst, &dst_rect) == 0) ? 0 : -1;
+	} else { /* we need to draw multiple times */
+		/* draw the first segment: from the start column to the end of the texture row */
+		pixels_written = texture->w  - texture_x_walker;
+		source_rect.w = pixels_written;
+		source_rect.x = texture_x_walker;
+		dst_rect.x= x1;
+		result |= (SDL_BlitSurface(texture, &source_rect, dst, &dst_rect) == 0) ? 0 : -1;
+		write_width = texture->w;
+
+		/* now draw the rest in chunks of one full texture row (the last chunk may be shorter) */
+		/* set the source x to 0 */
+		source_rect.x = 0;
+		while (pixels_written < w){
+			if (write_width >= w - pixels_written) {
+				write_width =  w - pixels_written;
+			}
+			source_rect.w = write_width;
+			dst_rect.x = x1 + pixels_written;
+			result |= (SDL_BlitSurface(texture, &source_rect, dst, &dst_rect) == 0) ? 0 : -1;
+			pixels_written += write_width;
+		}
+	}
+
+	return result;
+}
+
+/*!
+\brief Draws a polygon filled with the given texture (Multi-Threading Capable). 
+
+This operation use internally SDL_BlitSurface for lines of the source texture. It supports
+alpha drawing.
+
+To get the best performance of this operation you need to make sure the texture and the dst surface have the same format
+(see  http://docs.mandragor.org/files/Common_libs_documentation/SDL/SDL_Documentation_project_en/sdlblitsurface.html).
+The last two parameters are optional, but required for multithreaded operation. When set to NULL, uses global static temp array.
+
+\param dst the destination surface, 
+\param vx array of x vector components
+\param vy array of y vector components
+\param n the amount of vectors in the vx and vy array
+\param texture the sdl surface to use to fill the polygon
+\param texture_dx the offset of the texture relative to the screen. If you move the polygon 10 pixels 
+to the left and want the texture to appear the same you need to increase the texture_dx value
+\param texture_dy see texture_dx
+\param polyInts preallocated temp array storage for vertex sorting (used for multi-threaded operation)
+\param polyAllocated flag indicating if the temp array was allocated (used for multi-threaded operation)
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int texturedPolygonMT(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, 
+	SDL_Surface * texture, int texture_dx, int texture_dy, int **polyInts, int *polyAllocated)
+{
+	int result;	/* OR of the values returned by _HLineTextured */
+	int i;
+	int y, xa, xb;	/* current scanline; left and right pixel of one span */
+	int minx,maxx,miny, maxy;	/* bounding box of the polygon */
+	int x1, y1;	/* endpoint of the current edge with the smaller y */
+	int x2, y2;	/* endpoint of the current edge with the larger y */
+	int ind1, ind2;	/* vertex indices of the current edge */
+	int ints;	/* number of intersections stored for the current scanline */
+	int *gfxPrimitivesPolyInts = NULL;
+	int *gfxPrimitivesPolyIntsNew = NULL;
+	int gfxPrimitivesPolyAllocated = 0;
+
+	/*
+	* Check visibility of clipping rectangle
+	*/
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return(0);
+	}
+
+	/* Vertex arrays and texture are all dereferenced below, so none of them may be NULL */
+	if ((vx == NULL) || (vy == NULL) || (texture == NULL)) {
+		return (-1);
+	}
+
+	/* Sanity check number of edges */
+	if (n < 3) {
+		return -1;
+	}
+
+	/* Map polygon cache: the file-level one unless the caller passed both cache arguments */
+	if ((polyInts==NULL) || (polyAllocated==NULL)) {
+		/* Use global cache */
+		gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsGlobal;
+		gfxPrimitivesPolyAllocated = gfxPrimitivesPolyAllocatedGlobal;
+	} else {
+		/* Use local cache */
+		gfxPrimitivesPolyInts = *polyInts;
+		gfxPrimitivesPolyAllocated = *polyAllocated;
+	}
+
+	/*
+	* Allocate temp array, only grow array (one int per vertex is needed)
+	*/
+	if (!gfxPrimitivesPolyAllocated) {
+		gfxPrimitivesPolyInts = (int *) malloc(sizeof(int) * n);
+		gfxPrimitivesPolyAllocated = n;
+	} else {
+		if (gfxPrimitivesPolyAllocated < n) {
+			gfxPrimitivesPolyIntsNew = (int *) realloc(gfxPrimitivesPolyInts, sizeof(int) * n);
+			if (gfxPrimitivesPolyIntsNew == NULL) {
+				free(gfxPrimitivesPolyInts);	/* a failed realloc leaves the old block allocated */
+			}
+			gfxPrimitivesPolyInts = gfxPrimitivesPolyIntsNew;
+			gfxPrimitivesPolyAllocated = n;
+		}
+	}
+
+	/* Check temp array: a failed allocation leaves no usable storage, so the recorded size is 0 */
+	if (gfxPrimitivesPolyInts==NULL) {        
+		gfxPrimitivesPolyAllocated = 0;
+	}
+
+	/* Update cache variables, also on failure, so the cache never keeps a stale pointer */
+	if ((polyInts==NULL) || (polyAllocated==NULL)) { 
+		gfxPrimitivesPolyIntsGlobal =  gfxPrimitivesPolyInts;
+		gfxPrimitivesPolyAllocatedGlobal = gfxPrimitivesPolyAllocated;
+	} else {
+		*polyInts = gfxPrimitivesPolyInts;
+		*polyAllocated = gfxPrimitivesPolyAllocated;
+	}
+
+	/* Check temp array again: without scratch storage nothing can be drawn */
+	if (gfxPrimitivesPolyInts==NULL) {        
+		return(-1);
+	}
+
+	/*
+	* Determine X,Y minima,maxima 
+	*/
+	miny = vy[0];
+	maxy = vy[0];
+	minx = vx[0];
+	maxx = vx[0];
+	for (i = 1; (i < n); i++) {
+		if (vy[i] < miny) {
+			miny = vy[i];
+		} else if (vy[i] > maxy) {
+			maxy = vy[i];
+		}
+		if (vx[i] < minx) {
+			minx = vx[i];
+		} else if (vx[i] > maxx) {
+			maxx = vx[i];
+		}
+	}
+	if (maxx <0 || minx > dst->w){
+		return -1;
+	}
+	if (maxy <0 || miny > dst->h){
+		return -1;
+	}
+
+	/*
+	* Draw, scanning y: collect edge crossings per scanline, sort them, texture between pairs
+	*/
+	result = 0;
+	for (y = miny; (y <= maxy); y++) {
+		ints = 0;
+		for (i = 0; (i < n); i++) {
+			if (!i) {
+				ind1 = n - 1;
+				ind2 = 0;
+			} else {
+				ind1 = i - 1;
+				ind2 = i;
+			}
+			y1 = vy[ind1];
+			y2 = vy[ind2];
+			if (y1 < y2) {
+				x1 = vx[ind1];
+				x2 = vx[ind2];
+			} else if (y1 > y2) {
+				y2 = vy[ind1];
+				y1 = vy[ind2];
+				x2 = vx[ind1];
+				x1 = vx[ind2];
+			} else {
+				continue;	/* horizontal edges contribute no crossing */
+			}
+			if ( ((y >= y1) && (y < y2)) || ((y == maxy) && (y > y1) && (y <= y2)) ) {
+				gfxPrimitivesPolyInts[ints++] = ((65536 * (y - y1)) / (y2 - y1)) * (x2 - x1) + (65536 * x1);	/* crossing x, scaled by 65536 */
+			} 
+		}
+
+		qsort(gfxPrimitivesPolyInts, ints, sizeof(int), _gfxPrimitivesCompareInt);
+
+		for (i = 0; (i < ints); i += 2) {
+			xa = gfxPrimitivesPolyInts[i] + 1;
+			xa = (xa >> 16) + ((xa & 32768) >> 15);	/* scale back to pixels, rounding on the 0.5 bit */
+			xb = gfxPrimitivesPolyInts[i+1] - 1;
+			xb = (xb >> 16) + ((xb & 32768) >> 15);
+			result |= _HLineTextured(dst, xa, xb, y, texture, texture_dx, texture_dy);
+		}
+	}
+
+	return (result);
+}
+
+/*!
+\brief Draws a polygon filled with the given texture. 
+
+This standard version is calling multithreaded versions with NULL cache parameters.
+
+\param dst the destination surface, 
+\param vx array of x vector components
+\param vy array of y vector components
+\param n the amount of vectors in the vx and vy array
+\param texture the sdl surface to use to fill the polygon
+\param texture_dx the offset of the texture relative to the screen. If you move the polygon 10 pixels 
+to the left and want the texture to appear the same you need to increase the texture_dx value
+\param texture_dy see texture_dx
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int texturedPolygon(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, SDL_Surface *texture, int texture_dx, int texture_dy)
+{
+	/*
+	* No caller-owned scratch storage: both cache arguments are NULL.
+	*/
+	return texturedPolygonMT(dst, vx, vy, n, texture, texture_dx, texture_dy, NULL, NULL);
+}
+
+
+/* ---- Character */
+
+/*!
+\brief Global cache for NxM pixel font surfaces created at runtime; 256 slots, freed and reset to NULL by the font setters.
+*/
+static SDL_Surface *gfxPrimitivesFont[256];
+
+/*!
+\brief Global cache of the color used for the font surfaces created at runtime (256 entries, like the surface cache).
+*/
+static Uint32 gfxPrimitivesFontColor[256];
+
+/*!
+\brief Pointer to the current font data. Default is a 8x8 pixel internal font. Changed by gfxPrimitivesSetFont.
+*/
+static const unsigned char *currentFontdata = gfxPrimitivesFontdata;
+
+/*!
+\brief Width of the current font in pixels (the row pitch is derived from it as (width+7)/8 bytes). Default is 8. 
+*/
+static Uint32 charWidth = 8;
+
+/*!
+\brief Height of the current font in rows. Default is 8. 
+*/
+static Uint32 charHeight = 8;
+
+/*!
+\brief Width for rendering. Autocalculated: charHeight for rotation 1 or 3, charWidth otherwise.
+*/
+static Uint32 charWidthLocal = 8;
+
+/*!
+\brief Height for rendering. Autocalculated: charWidth for rotation 1 or 3, charHeight otherwise.
+*/
+static Uint32 charHeightLocal = 8;
+
+/*!
+\brief Pitch of the current font in bytes, computed as (charWidth+7)/8. Default is 1. 
+*/
+static Uint32 charPitch = 1;
+
+/*!
+\brief Characters 90deg clockwise rotations. Default is 0. Max is 3. Changed by gfxPrimitivesSetFontRotation.
+*/
+static Uint32 charRotation = 0;
+
+/*!
+\brief Character data size in bytes of the current font, computed as charPitch * charHeight. Default is 8. 
+*/
+static Uint32 charSize = 8;
+
+/*!
+\brief Sets or resets the current global font data.
+
+The font data array is organized as follows: 
+[fontdata] = [character 0][character 1]...[character 255] where
+[character n] = [byte 1 row 1][byte 2 row 1]...[byte {pitch} row 1][byte 1 row 2] ...[byte {pitch} row height] where
+[byte n] = [bit 0]...[bit 7] where 
+[bit n] = [0 for transparent pixel|1 for colored pixel]
+
+\param fontdata Pointer to array of font data. Set to NULL, to reset global font to the default 8x8 font.
+\param cw Width of character in pixels. Ignored (default font is used) if fontdata==NULL or if cw or ch is 0.
+\param ch Height of character in pixels. Ignored (default font is used) if fontdata==NULL or if cw or ch is 0.
+*/
+void gfxPrimitivesSetFont(const void *fontdata, Uint32 cw, Uint32 ch)
+{
+	/* A custom font is accepted only if data pointer, width and height are all non-zero */
+	const int custom = (fontdata != NULL) && (cw != 0) && (ch != 0);
+	/* Rotations by one or three quarter turns exchange width and height for rendering */
+	const int quarterTurned = (charRotation == 1) || (charRotation == 3);
+	/* Index into the per-character surface cache */
+	int slot;
+
+	/* Select the font description: the caller's, or the built-in 8x8 font as fallback */
+	currentFontdata = custom ? (const unsigned char *) fontdata : gfxPrimitivesFontdata;
+	charWidth = custom ? cw : 8;
+	charHeight = custom ? ch : 8;
+
+	/*
+	* Derived layout: bytes per character row (width rounded up to
+	* whole bytes) and bytes per character.
+	*/
+	charPitch = (charWidth + 7) / 8;
+	charSize = charPitch * charHeight;
+
+	/* Extent of a rendered character, taking the current rotation into account */
+	charWidthLocal = quarterTurned ? charHeight : charWidth;
+	charHeightLocal = quarterTurned ? charWidth : charHeight;
+
+	/*
+	* Reset the character cache: free every populated slot and mark
+	* it empty.
+	*/
+	for (slot = 0; slot < 256; slot++) {
+		if (gfxPrimitivesFont[slot] == NULL) {
+			continue;
+		}
+		SDL_FreeSurface(gfxPrimitivesFont[slot]);
+		gfxPrimitivesFont[slot] = NULL;
+	}
+}
+
+/*!
+\brief Sets current global font character rotation steps. 
+
+Default is 0 (no rotation). 1 = 90deg clockwise. 2 = 180deg clockwise. 3 = 270deg clockwise.
+Only the two lowest bits of the argument are used; requesting the active rotation does nothing.
+Changing the rotation resets the character cache.
+
+\param rotation Number of 90deg clockwise steps to rotate
+*/
+void gfxPrimitivesSetFontRotation(Uint32 rotation)
+{
+	int slot;
+	const Uint32 steps = rotation & 3;
+
+	/* Nothing to do when the requested rotation is already active */
+	if (steps == charRotation) {
+		return;
+	}
+
+	/* Store rotation */
+	charRotation = steps;
+
+	/* Odd step counts turn the cell on its side: swap rendered width and height */
+	if ((steps & 1) != 0) {
+		charWidthLocal = charHeight;
+		charHeightLocal = charWidth;
+	} else {
+		charWidthLocal = charWidth;
+		charHeightLocal = charHeight;
+	}
+
+	/* Cached surfaces were rendered for the previous rotation; release them */
+	for (slot = 0; slot < 256; slot++) {
+		if (gfxPrimitivesFont[slot] != NULL) {
+			SDL_FreeSurface(gfxPrimitivesFont[slot]);
+			gfxPrimitivesFont[slot] = NULL;
+		}
+	}
+}
+
+/*!
+\brief Draw a character of the currently set font.
+
+On first call for a particular character and color combination, the function needs to
+generate the character surface (slower). Subsequent calls blit a cached surface (fast).
+Uses alpha blending if A<255 in color.
+
+\param dst The surface to draw on. Must not be NULL (-1 is returned otherwise).
+\param x X (horizontal) coordinate of the upper left corner of the character.
+\param y Y (vertical) coordinate of the upper left corner of the character.
+\param c The character to draw.
+\param color The color value of the character to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int characterColor(SDL_Surface * dst, Sint16 x, Sint16 y, char c, Uint32 color)
+{
+	Sint16 left, right, top, bottom;
+	Sint16 x1, y1, x2, y2;
+	SDL_Rect srect;
+	SDL_Rect drect;
+	int result;
+	Uint32 ix, iy;
+	const unsigned char *charpos;
+	Uint8 *curpos;
+	int forced_redraw;
+	Uint8 patt, mask;
+	Uint8 *linepos;
+	Uint32 pitch;
+	SDL_Surface *rotatedCharacter;
+	Uint32 ci;
+
+	/* A destination surface is required */
+	if (dst == NULL) {
+		return (-1);
+	}
+
+	/* Nothing is visible through an empty clipping rectangle */
+	if ((dst->clip_rect.w==0) || (dst->clip_rect.h==0)) {
+		return(0);
+	}
+
+	/*
+	* Get text and clipping boundary and
+	* test if bounding box of character is visible 
+	*/
+	left = dst->clip_rect.x;
+	x2 = x + charWidthLocal;
+	if (x2<left) {
+		return(0);
+	} 
+	right = dst->clip_rect.x + dst->clip_rect.w - 1;
+	x1 = x;
+	if (x1>right) {
+		return(0);
+	} 
+	top = dst->clip_rect.y;
+	y2 = y + charHeightLocal;
+	if (y2<top) {
+		return(0);
+	} 
+	bottom = dst->clip_rect.y + dst->clip_rect.h - 1;
+	y1 = y;
+	if (y1>bottom) {
+		return(0);
+	} 
+
+	/* Source rectangle: the whole cached (possibly rotated) character */
+	srect.x = 0;
+	srect.y = 0;
+	srect.w = charWidthLocal;
+	srect.h = charHeightLocal;
+
+	/* Destination rectangle: same size, positioned at (x,y) */
+	drect.x = x;
+	drect.y = y;
+	drect.w = charWidthLocal;
+	drect.h = charHeightLocal;
+
+	/* Character index in cache */
+	ci = (unsigned char) c;
+
+	/*
+	* A cached image stored rotated cannot be redrawn in place: for non-square fonts its
+	* surface has swapped dimensions and the drawing loop would overrun its pixel buffer.
+	* Discard it so that a fresh unrotated surface is created below.
+	*/
+	if ((gfxPrimitivesFont[ci] != NULL) && (charRotation > 0) && (gfxPrimitivesFontColor[ci] != color)) {
+		SDL_FreeSurface(gfxPrimitivesFont[ci]);
+		gfxPrimitivesFont[ci] = NULL;
+	}
+
+	/*
+	* Create new charWidth x charHeight bitmap surface if not already present.
+	* Might get rotated later.
+	*/
+	if (gfxPrimitivesFont[ci] == NULL) {
+		gfxPrimitivesFont[ci] =
+			SDL_CreateRGBSurface(SDL_SWSURFACE | SDL_HWSURFACE | SDL_SRCALPHA,
+			charWidth, charHeight, 32,
+			0xFF000000, 0x00FF0000, 0x0000FF00, 0x000000FF);
+		if (gfxPrimitivesFont[ci] == NULL) {
+			return (-1);
+		}
+		/* A new surface has no content yet: always draw it */
+		forced_redraw = 1;
+	} else {
+		forced_redraw = 0;
+	}
+
+	/* Redraw the character if the color has changed or the surface is new */
+	if ((gfxPrimitivesFontColor[ci] != color) || (forced_redraw)) {
+		SDL_SetAlpha(gfxPrimitivesFont[ci], SDL_SRCALPHA, 255);
+
+		/* Lock font-surface; drop the unusable surface so the next call starts over */
+		if (SDL_LockSurface(gfxPrimitivesFont[ci]) != 0) {
+			SDL_FreeSurface(gfxPrimitivesFont[ci]);
+			gfxPrimitivesFont[ci] = NULL;
+			return (-1);
+		}
+
+		/* Variable setup */
+		charpos = currentFontdata + ci * charSize;
+		linepos = (Uint8 *) gfxPrimitivesFont[ci]->pixels;
+		pitch = gfxPrimitivesFont[ci]->pitch;
+
+		/* Drawing loop: each bitmap row starts on a new byte, leftmost pixel in bit 7 */
+		patt = 0;
+		for (iy = 0; iy < charHeight; iy++) {
+			mask = 0x00;
+			curpos = linepos;
+			for (ix = 0; ix < charWidth; ix++) {
+				if (!(mask >>= 1)) {
+					patt = *charpos++;
+					mask = 0x80;
+				}
+
+				if (patt & mask)
+					*(Uint32 *)curpos = color;
+				else
+					*(Uint32 *)curpos = 0;
+				curpos += 4;
+			}
+			linepos += pitch;
+		}
+
+		/* Unlock font-surface */
+		SDL_UnlockSurface(gfxPrimitivesFont[ci]);
+
+		/* Maybe rotate and replace cached image */
+		if (charRotation>0)
+		{
+			rotatedCharacter = rotateSurface90Degrees(gfxPrimitivesFont[ci], charRotation);
+			SDL_FreeSurface(gfxPrimitivesFont[ci]);
+			gfxPrimitivesFont[ci] = rotatedCharacter;
+			if (rotatedCharacter == NULL) {
+				return (-1);
+			}
+		}
+
+		/* Record the color only once the cached image really holds it */
+		gfxPrimitivesFontColor[ci] = color;
+	}
+
+	/* Draw bitmap onto destination surface */
+	result = SDL_BlitSurface(gfxPrimitivesFont[ci], &srect, dst, &drect);
+
+	return (result);
+}
+
+/*!
+\brief Draw a character of the currently set font, with the color given as separate components.
+
+\param dst The surface to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the character.
+\param y Y (vertical) coordinate of the upper left corner of the character.
+\param c The character to draw.
+\param r The red value of the character to draw. 
+\param g The green value of the character to draw. 
+\param b The blue value of the character to draw. 
+\param a The alpha value of the character to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int characterRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, char c, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed = ((Uint32) r << 24) | ((Uint32) g << 16);
+	packed |= ((Uint32) b << 8) | (Uint32) a;
+	/* Delegate to the packed-color (0xRRGGBBAA) variant */
+	return (characterColor(dst, x, y, c, packed));
+}
+
+/*!
+\brief Draw a string in the currently set font.
+
+The spacing between consecutive characters in the string is the fixed number of pixels 
+of the character width of the current global font.
+
+Drawing stops at the first character that fails to draw. Depending on the font rotation the
+string advances towards +x (0), +y (1), -x (2) or -y (3).
+
+\param dst The surface to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the string.
+\param y Y (vertical) coordinate of the upper left corner of the string.
+\param s The string to draw.
+\param color The color value of the string to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int stringColor(SDL_Surface * dst, Sint16 x, Sint16 y, const char *s, Uint32 color)
+{
+	const char *p;
+	Sint16 penx = x;
+	Sint16 peny = y;
+	int status = 0;
+
+	/* Draw until the terminator or the first failing character */
+	for (p = s; (*p != '\0') && (status == 0); p++) {
+		status |= characterColor(dst, penx, peny, *p, color);
+
+		/* Advance the pen by one cell along the direction the text runs in */
+		if (charRotation == 0) {
+			penx += charWidthLocal;
+		} else if (charRotation == 2) {
+			penx -= charWidthLocal;
+		} else if (charRotation == 1) {
+			peny += charHeightLocal;
+		} else if (charRotation == 3) {
+			peny -= charHeightLocal;
+		}
+	}
+
+	/* 0 when every character was drawn, otherwise the result of the failing one */
+	return (status);
+}
+
+/*!
+\brief Draw a string in the currently set font, with the color given as separate components.
+
+\param dst The surface to draw on.
+\param x X (horizontal) coordinate of the upper left corner of the string.
+\param y Y (vertical) coordinate of the upper left corner of the string.
+\param s The string to draw.
+\param r The red value of the string to draw. 
+\param g The green value of the string to draw. 
+\param b The blue value of the string to draw. 
+\param a The alpha value of the string to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int stringRGBA(SDL_Surface * dst, Sint16 x, Sint16 y, const char *s, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed = ((Uint32) r << 24) | ((Uint32) g << 16);
+	packed |= ((Uint32) b << 8) | (Uint32) a;
+	/* Delegate to the packed-color (0xRRGGBBAA) variant */
+	return (stringColor(dst, x, y, s, packed));
+}
+
+/* ---- Bezier curve */
+
+/*!
+\brief Internal function to calculate bezier interpolator of data array with ndata values at position 't'.
+
+\param data Array of values.
+\param ndata Size of array.
+\param t Position for which to calculate interpolated value. t should be between [0, ndata].
+
+\returns Interpolated value at position t, value[0] when t<0, value[n-1] when t>=n.
+*/
+double _evaluateBezier (double *data, int ndata, double t) 
+{
+	double u, sum, weight;
+	double uPow, vPow;	/* u^k and (1-u)^(n-k) for the current term */
+	int degree, term;
+	int f, kLeft, nkLeft;
+
+	/* Positions outside the array map to its end points */
+	if (t < 0.0) {
+		return (data[0]);
+	}
+	if (t >= (double)ndata) {
+		return (data[ndata-1]);
+	}
+
+	/* Adjust t to the range 0.0 to 1.0 */
+	u = t/(double)ndata;
+
+	/* Sum the terms C(n,k) * u^k * (1-u)^(n-k) * data[k] for k = 0..n */
+	degree = ndata-1;
+	sum = 0.0;
+	uPow = 1;
+	vPow = pow(1-u,(double)degree);
+	for (term = 0; term <= degree; term++) {
+		weight = uPow * vPow;
+		uPow *= u;		/* step both power factors on to the next term */
+		vPow /= (1-u);
+		/* Binomial coefficient n!/(k!(n-k)!): multiply by n..1, divide by k..2 and (n-k)..2 */
+		kLeft = term;
+		nkLeft = degree - term;
+		for (f = degree; f >= 1; f--) {
+			weight *= f;
+			if (kLeft > 1) {
+				weight /= (double)kLeft;
+				kLeft--;
+			}
+			if (nkLeft > 1) {
+				weight /= (double)nkLeft;
+				nkLeft--;
+			}
+		}
+		sum += data[term] * weight;
+	}
+
+	return (sum);
+}
+
+/*!
+\brief Draw a bezier curve with alpha blending, approximated by straight line segments.
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the bezier curve.
+\param vy Vertex array containing Y coordinates of the points of the bezier curve.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param s Number of steps for the interpolation. Minimum number is 2.
+\param color The color value of the bezier curve to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on invalid n or s or on allocation failure; otherwise the OR-ed lineColor() results.
+*/
+int bezierColor(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, int s, Uint32 color)
+{
+	int result;
+	int i;
+	double *x, *y, t, stepsize;
+	Sint16 x1, y1, x2, y2;
+
+	/*
+	* Sanity check 
+	*/
+	if (n < 3) {
+		return (-1);
+	}
+	if (s < 2) {
+		return (-1);
+	}
+
+	/*
+	* Variable setup: t advances by 1/s per drawn segment
+	*/
+	stepsize=(double)1.0/(double)s;
+
+	/* Copy vertices into double arrays with one extra slot that repeats the first vertex */
+	if ((x=(double *)malloc(sizeof(double)*(n+1)))==NULL) {
+		return(-1);
+	}
+	if ((y=(double *)malloc(sizeof(double)*(n+1)))==NULL) {
+		free(x);
+		return(-1);
+	}    
+	for (i=0; i<n; i++) {
+		x[i]=(double)vx[i];
+		y[i]=(double)vy[i];
+	}      
+	x[n]=(double)vx[0];
+	y[n]=(double)vy[0];
+
+	/*
+	* Draw n*s+1 segments, each from the previous curve point to the next one
+	*/
+	result = 0;
+	t=0.0;
+	x1=(Sint16)lrint(_evaluateBezier(x,n+1,t));	/* t is 0 here, so this evaluates to the first vertex */
+	y1=(Sint16)lrint(_evaluateBezier(y,n+1,t));
+	for (i = 0; i <= (n*s); i++) {	/* on the last iterations t reaches n, where _evaluateBezier() returns the last vertex */
+		t += stepsize;
+		x2=(Sint16)_evaluateBezier(x,n,t);	/* evaluated over the n original vertices; the cast truncates instead of rounding */
+		y2=(Sint16)_evaluateBezier(y,n,t);
+		result |= lineColor(dst, x1, y1, x2, y2, color);
+		x1 = x2;
+		y1 = y2;
+	}
+
+	/* Clean up temporary array */
+	free(x);
+	free(y);
+
+	return (result);
+}
+
+/*!
+\brief Draw a bezier curve with alpha blending, with the color given as separate components.
+
+\param dst The surface to draw on.
+\param vx Vertex array containing X coordinates of the points of the bezier curve.
+\param vy Vertex array containing Y coordinates of the points of the bezier curve.
+\param n Number of points in the vertex array. Minimum number is 3.
+\param s Number of steps for the interpolation. Minimum number is 2.
+\param r The red value of the bezier curve to draw. 
+\param g The green value of the bezier curve to draw. 
+\param b The blue value of the bezier curve to draw. 
+\param a The alpha value of the bezier curve to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/
+int bezierRGBA(SDL_Surface * dst, const Sint16 * vx, const Sint16 * vy, int n, int s, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed = ((Uint32) r << 24) | ((Uint32) g << 16);
+	packed |= ((Uint32) b << 8) | (Uint32) a;
+	/* Delegate to the packed-color (0xRRGGBBAA) variant */
+	return (bezierColor(dst, vx, vy, n, s, packed));
+}
+
+
+/*!
+\brief Internal function to initialize the Bresenham line iterator.
+
+Example of use:
+SDL_gfxBresenhamIterator b;
+_bresenhamInitialize (&b, x1, y1, x2, y2);
+do { 
+plot(b.x, b.y); 
+} while (_bresenhamIterate(&b)==0); 
+
+\param b Pointer to struct for bresenham line drawing state.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+
+\returns Returns 0 on success, -1 on failure (b is NULL).
+*/
+int _bresenhamInitialize(SDL_gfxBresenhamIterator *b, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2)
+{
+	int temp;
+
+	if (b==NULL) {
+		return(-1);
+	}
+
+	/* The iterator starts on the first point */
+	b->x = x1;
+	b->y = y1;
+
+	/* dx = abs(x2-x1), s1 = sign(x2-x1) */
+	if ((b->dx = x2 - x1) != 0) {
+		if (b->dx < 0) {
+			b->dx = -b->dx;
+			b->s1 = -1;
+		} else {
+			b->s1 = 1;
+		}
+	} else {
+		b->s1 = 0;	
+	}
+
+	/* dy = abs(y2-y1), s2 = sign(y2-y1)    */
+	if ((b->dy = y2 - y1) != 0) {
+		if (b->dy < 0) {
+			b->dy = -b->dy;
+			b->s2 = -1;
+		} else {
+			b->s2 = 1;
+		}
+	} else {
+		b->s2 = 0;	
+	}
+
+	if (b->dy > b->dx) {	/* make dx the larger of the two deltas and remember that they were swapped */
+		temp = b->dx;
+		b->dx = b->dy;
+		b->dy = temp;
+		b->swapdir = 1;
+	} else {
+		b->swapdir = 0;
+	}
+
+	b->count = (b->dx<0) ? 0 : (unsigned int)b->dx;	/* number of steps _bresenhamIterate() will take */
+	b->dy <<= 1;	/* from here on dy holds twice the smaller delta */
+	b->error = b->dy - b->dx;	/* initial error term: 2*dy - dx */
+	b->dx <<= 1;	/* and dx twice the larger delta */
+
+	return(0);
+}
+
+
+/*!
+\brief Internal function to move Bresenham line iterator to the next position.
+
+Maybe updates the x and y coordinates of the iterator struct.
+
+\param b Pointer to struct for bresenham line drawing state.
+
+\returns Returns 0 on success, 1 if last point was reached, 2 if moving past end-of-line, -1 on failure (b is NULL).
+*/
+int _bresenhamIterate(SDL_gfxBresenhamIterator *b)
+{	
+	if (b==NULL) {
+		return (-1);
+	}
+
+	/* last point check: no steps left, position stays unchanged */
+	if (b->count==0) {
+		return (2);
+	}
+
+	/* Step along the axis with the smaller delta while the error term is non-negative */
+	while (b->error >= 0) {
+		if (b->swapdir) {
+			b->x += b->s1;
+		} else  {
+			b->y += b->s2;
+		}
+
+		b->error -= b->dx;
+	}
+
+	/* Always take one step along the axis with the larger delta */
+	if (b->swapdir) {
+		b->y += b->s2;
+	} else {
+		b->x += b->s1;
+	}
+
+	b->error += b->dy;	
+	b->count--;		
+
+	/* count==0 indicates "end-of-line" */
+	return ((b->count) ? 0 : 1);
+}
+
+
+/*!
+\brief Internal function to draw parallel lines with Murphy algorithm.
+
+\param m Pointer to struct for murphy iterator. The end position is stored in m->tempx / m->tempy.
+\param x X coordinate of point.
+\param y Y coordinate of point.
+\param d1 Direction square/diagonal. The value is negated before use.
+*/
+void _murphyParaline(SDL_gfxMurphyIterator *m, Sint16 x, Sint16 y, int d1)
+{
+	int p;
+	d1 = -d1;
+
+	/*
+	* Lock the surface 
+	*/
+	if (SDL_MUSTLOCK(m->dst)) {
+		SDL_LockSurface(m->dst);
+	}
+
+	for (p = 0; p <= m->u; p++) {	/* plots m->u + 1 pixels */
+
+		pixelColorNolock(m->dst, x, y, m->color);
+
+		if (d1 <= m->kt) {	/* square move: one step along a single axis */
+			if (m->oct2 == 0) {
+				x++;
+			} else {
+				if (m->quad4 == 0) {
+					y++;
+				} else {
+					y--;
+				}
+			}
+			d1 += m->kv;
+		} else {	/* diagonal move: one step along both axes */
+			x++;
+			if (m->quad4 == 0) {
+				y++;
+			} else {
+				y--;
+			}
+			d1 += m->kd;
+		}
+	}
+
+	/* Unlock surface */
+	if (SDL_MUSTLOCK(m->dst)) {
+		SDL_UnlockSurface(m->dst);
+	}
+
+	m->tempx = x;	/* position after the last step, read back by _murphyWideline() */
+	m->tempy = y;
+}
+
+/*!
+\brief Internal function to draw one iteration of the Murphy algorithm.
+
+\param m Pointer to struct for murphy iterator. Its first*/last* fields are overwritten with the passed points.
+\param miter Iteration count. The connecting outline and polygon are only drawn when this is greater than 1.
+\param ml1bx X coordinate of a point.
+\param ml1by Y coordinate of a point.
+\param ml2bx X coordinate of a point.
+\param ml2by Y coordinate of a point.
+\param ml1x X coordinate of a point.
+\param ml1y Y coordinate of a point.
+\param ml2x X coordinate of a point.
+\param ml2y Y coordinate of a point.
+
+*/
+void _murphyIteration(SDL_gfxMurphyIterator *m, Uint8 miter, 
+	Uint16 ml1bx, Uint16 ml1by, Uint16 ml2bx, Uint16 ml2by, 
+	Uint16 ml1x, Uint16 ml1y, Uint16 ml2x, Uint16 ml2y)
+{
+	int atemp1, atemp2;
+	int ftmp1, ftmp2;
+	Uint16 m1x, m1y, m2x, m2y;	
+	Uint16 fix, fiy, lax, lay, curx, cury;
+	Uint16 px[4], py[4];
+	SDL_gfxBresenhamIterator b;
+
+	if (miter > 1) {
+		if (m->first1x != -32768) {	/* -32768 is what _murphyWideline() stores when miter is 0; presumably "no previous points" */
+			fix = (m->first1x + m->first2x) / 2;
+			fiy = (m->first1y + m->first2y) / 2;
+			lax = (m->last1x + m->last2x) / 2;
+			lay = (m->last1y + m->last2y) / 2;
+			curx = (ml1x + ml2x) / 2;
+			cury = (ml1y + ml2y) / 2;
+
+			atemp1 = (fix - curx);
+			atemp2 = (fiy - cury);
+			ftmp1 = atemp1 * atemp1 + atemp2 * atemp2;	/* squared distance from the stored "first" midpoint */
+			atemp1 = (lax - curx);
+			atemp2 = (lay - cury);
+			ftmp2 = atemp1 * atemp1 + atemp2 * atemp2;	/* squared distance from the stored "last" midpoint */
+
+			if (ftmp1 <= ftmp2) {	/* continue with whichever stored pair is closer */
+				m1x = m->first1x;
+				m1y = m->first1y;
+				m2x = m->first2x;
+				m2y = m->first2y;
+			} else {
+				m1x = m->last1x;
+				m1y = m->last1y;
+				m2x = m->last2x;
+				m2y = m->last2y;
+			}
+
+			atemp1 = (m2x - ml2x);
+			atemp2 = (m2y - ml2y);
+			ftmp1 = atemp1 * atemp1 + atemp2 * atemp2;
+			atemp1 = (m2x - ml2bx);
+			atemp2 = (m2y - ml2by);
+			ftmp2 = atemp1 * atemp1 + atemp2 * atemp2;
+
+			if (ftmp2 >= ftmp1) {	/* exchange each point with its "b" counterpart; ftmp1/ftmp2 serve as swap temporaries */
+				ftmp1 = ml2bx;
+				ftmp2 = ml2by;
+				ml2bx = ml2x;
+				ml2by = ml2y;
+				ml2x = ftmp1;
+				ml2y = ftmp2;
+				ftmp1 = ml1bx;
+				ftmp2 = ml1by;
+				ml1bx = ml1x;
+				ml1by = ml1y;
+				ml1x = ftmp1;
+				ml1y = ftmp2;
+			}
+
+			/*
+			* Lock the surface 
+			*/
+			if (SDL_MUSTLOCK(m->dst)) {
+				SDL_LockSurface(m->dst);
+			}
+
+			_bresenhamInitialize(&b, m2x, m2y, m1x, m1y);	/* outline edge 1 of 4 */
+			do {
+				pixelColorNolock(m->dst, b.x, b.y, m->color);
+			} while (_bresenhamIterate(&b)==0);
+
+			_bresenhamInitialize(&b, m1x, m1y, ml1bx, ml1by);	/* outline edge 2 of 4 */
+			do {
+				pixelColorNolock(m->dst, b.x, b.y, m->color);
+			} while (_bresenhamIterate(&b)==0);
+
+			_bresenhamInitialize(&b, ml1bx, ml1by, ml2bx, ml2by);	/* outline edge 3 of 4 */
+			do {
+				pixelColorNolock(m->dst, b.x, b.y, m->color);
+			} while (_bresenhamIterate(&b)==0);
+
+			_bresenhamInitialize(&b, ml2bx, ml2by, m2x, m2y);	/* outline edge 4 of 4 */
+			do {
+				pixelColorNolock(m->dst, b.x, b.y, m->color);
+			} while (_bresenhamIterate(&b)==0);
+
+			/* Unlock surface */
+			if (SDL_MUSTLOCK(m->dst)) {
+				SDL_UnlockSurface(m->dst);
+			}
+
+			px[0] = m1x;
+			px[1] = m2x;
+			px[2] = ml1bx;
+			px[3] = ml2bx;
+			py[0] = m1y;
+			py[1] = m2y;
+			py[2] = ml1by;
+			py[3] = ml2by;			
+			polygonColor(m->dst, px, py, 4, m->color);	/* NOTE(review): px/py are Uint16 arrays - confirm against polygonColor's vertex parameter type */
+		}
+	}
+
+	/* Remember the points of this call for the next one */
+	m->last1x = ml1x;
+	m->last1y = ml1y;
+	m->last2x = ml2x;
+	m->last2y = ml2y;
+	m->first1x = ml1bx;
+	m->first1y = ml1by;
+	m->first2x = ml2bx;
+	m->first2y = ml2by;
+}
+
+
+#define HYPOT(x,y) sqrt((double)(x)*(double)(x)+(double)(y)*(double)(y)) 
+
+/*!
+\brief Internal function to draw wide lines with Murphy algorithm.
+
+Draws lines parallel to ideal line.
+
+\param m Pointer to struct for murphy iterator.
+\param x1 X coordinate of first point.
+\param y1 Y coordinate of first point.
+\param x2 X coordinate of second point.
+\param y2 Y coordinate of second point.
+\param width Width of line.
+\param miter Iteration count. 0 resets the stored first/last points of the iterator to -32768.
+
+*/
+void _murphyWideline(SDL_gfxMurphyIterator *m, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 width, Uint8 miter)
+{	
+	float offset = (float)width / 2.f;
+
+	Sint16 temp;
+	Sint16 ptx, pty;
+	/* Edge points handed to _murphyIteration(); zeroed because the ml2* set is only assigned when the outer loop runs more than once */
+	Sint16 ml1x = 0, ml1y = 0, ml2x = 0, ml2y = 0, ml1bx = 0, ml1by = 0, ml2bx = 0, ml2by = 0;
+
+	int d0, d1;		/* difference terms d0=perpendicular to line, d1=along line */
+
+	int q;			/* pel counter,q=perpendicular to line */
+	int tmp;
+
+	int dd;			/* distance along line */
+	int tk;			/* thickness threshold */
+	double ang;		/* angle for initial point calculation */
+	double sang, cang;
+
+	/* Initialisation */
+	m->u = x2 - x1;	/* delta x */
+	m->v = y2 - y1;	/* delta y */
+
+	if (m->u < 0) {	/* swap to make sure we are in quadrants 1 or 4 */
+		temp = x1;
+		x1 = x2;
+		x2 = temp;
+		temp = y1;
+		y1 = y2;
+		y2 = temp;		
+		m->u *= -1;
+		m->v *= -1;
+	}
+
+	if (m->v < 0) {	/* swap to 1st quadrant and flag */
+		m->v *= -1;
+		m->quad4 = 1;
+	} else {
+		m->quad4 = 0;
+	}
+
+	if (m->v > m->u) {	/* swap things if in 2 octant */
+		tmp = m->u;
+		m->u = m->v;
+		m->v = tmp;
+		m->oct2 = 1;
+	} else {
+		m->oct2 = 0;
+	}
+
+	m->ku = m->u + m->u;	/* change in l for square shift */
+	m->kv = m->v + m->v;	/* change in d for square shift */
+	m->kd = m->kv - m->ku;	/* change in d for diagonal shift */
+	m->kt = m->u - m->kv;	/* diag/square decision threshold */
+
+	d0 = 0;
+	d1 = 0;
+	dd = 0;
+
+	ang = atan((double) m->v / (double) m->u);	/* calc new initial point - offset both sides of ideal */	
+	sang = sin(ang);
+	cang = cos(ang);
+
+	if (m->oct2 == 0) {
+		ptx = x1 + (Sint16)lrint(offset * sang);
+		if (m->quad4 == 0) {
+			pty = y1 - (Sint16)lrint(offset * cang);
+		} else {
+			pty = y1 + (Sint16)lrint(offset * cang);
+		}
+	} else {
+		ptx = x1 - (Sint16)lrint(offset * cang);
+		if (m->quad4 == 0) {
+			pty = y1 + (Sint16)lrint(offset * sang);
+		} else {
+			pty = y1 - (Sint16)lrint(offset * sang);
+		}
+	}
+
+	/* used here for constant thickness line */
+	tk = (int) (4. * HYPOT(ptx - x1, pty - y1) * HYPOT(m->u, m->v));
+
+	if (miter == 0) {
+		m->first1x = -32768;
+		m->first1y = -32768;
+		m->first2x = -32768;
+		m->first2y = -32768;
+		m->last1x = -32768;
+		m->last1y = -32768;
+		m->last2x = -32768;
+		m->last2y = -32768;
+	}
+
+	for (q = 0; dd <= tk; q++) {	/* outer loop, stepping perpendicular to line */
+
+		_murphyParaline(m, ptx, pty, d1);	/* call to inner loop - right edge */
+		if (q == 0) {
+			ml1x = ptx;
+			ml1y = pty;
+			ml1bx = m->tempx;
+			ml1by = m->tempy;
+		} else {
+			ml2x = ptx;
+			ml2y = pty;
+			ml2bx = m->tempx;
+			ml2by = m->tempy;
+		}
+		if (d0 < m->kt) {	/* square move */
+			if (m->oct2 == 0) {
+				if (m->quad4 == 0) {
+					pty++;
+				} else {
+					pty--;
+				}
+			} else {
+				ptx++;
+			}
+		} else {	/* diagonal move */
+			dd += m->kv;
+			d0 -= m->ku;
+			if (d1 < m->kt) {	/* normal diagonal */
+				if (m->oct2 == 0) {
+					ptx--;
+					if (m->quad4 == 0) {
+						pty++;
+					} else {
+						pty--;
+					}
+				} else {
+					ptx++;
+					if (m->quad4 == 0) {
+						pty--;
+					} else {
+						pty++;
+					}
+				}
+				d1 += m->kv;
+			} else {	/* double square move, extra parallel line */
+				if (m->oct2 == 0) {
+					ptx--;
+				} else {
+					if (m->quad4 == 0) {
+						pty--;
+					} else {
+						pty++;
+					}
+				}
+				d1 += m->kd;
+				if (dd > tk) {
+					_murphyIteration(m, miter, ml1bx, ml1by, ml2bx, ml2by, ml1x, ml1y, ml2x, ml2y);
+					return;	/* breakout on the extra line */
+				}
+				_murphyParaline(m, ptx, pty, d1);
+				if (m->oct2 == 0) {
+					if (m->quad4 == 0) {
+						pty++;
+					} else {
+
+						pty--;
+					}
+				} else {
+					ptx++;
+				}
+			}
+		}
+		dd += m->ku;
+		d0 += m->kv;
+	}
+
+	_murphyIteration(m, miter, ml1bx, ml1by, ml2bx, ml2by, ml1x, ml1y, ml2x, ml2y);
+}
+
+
+/*!
+\brief Draw a thick line with alpha blending.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+\param width Width of the line in pixels. Must be >0.
+\param color The color value of the line to draw (0xRRGGBBAA). 
+
+\returns Returns 0 on success, -1 on failure (dst is NULL or width is 0).
+*/
+int thickLineColor(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 width, Uint32 color)
+{	
+	int wh;
+	SDL_gfxMurphyIterator m;
+
+	if (dst == NULL) return -1;
+	if (width < 1) return -1;
+
+	/* Special case: thick "point" - a width x width box starting width/2 before the point (box corners are inclusive) */
+	if ((x1 == x2) && (y1 == y2)) {
+		wh = width / 2;
+		return boxColor(dst, x1 - wh, y1 - wh, x1 - wh + width - 1, y1 - wh + width - 1, color);
+	}
+
+	m.dst = dst;
+	m.color = color;
+
+	_murphyWideline(&m, x1, y1, x2, y2, width, 0);
+	_murphyWideline(&m, x1, y1, x2, y2, width, 1);
+
+	return(0);
+}
+
+/*!
+\brief Draw a thick line with alpha blending, with the color given as separate components.
+
+\param dst The surface to draw on.
+\param x1 X coordinate of the first point of the line.
+\param y1 Y coordinate of the first point of the line.
+\param x2 X coordinate of the second point of the line.
+\param y2 Y coordinate of the second point of the line.
+\param width Width of the line in pixels. Must be >0.
+\param r The red value of the line to draw. 
+\param g The green value of the line to draw. 
+\param b The blue value of the line to draw. 
+\param a The alpha value of the line to draw.
+
+\returns Returns 0 on success, -1 on failure.
+*/	
+int thickLineRGBA(SDL_Surface * dst, Sint16 x1, Sint16 y1, Sint16 x2, Sint16 y2, Uint8 width, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
+{
+	Uint32 packed = ((Uint32) r << 24) | ((Uint32) g << 16) | ((Uint32) b << 8) | (Uint32) a;
+	return (thickLineColor(dst, x1, y1, x2, y2, width, packed));
+}
diff --git a/src/gfx/SDL_imageFilter.c b/src/gfx/SDL_imageFilter.c
new file mode 100644
index 0000000..f3059c9
--- /dev/null
+++ b/src/gfx/SDL_imageFilter.c
@@ -0,0 +1,7368 @@
+/*
+
+SDL_imageFilter.c: byte-image "filter" routines
+
+Copyright (C) 2001-2012  Andreas Schiffler
+Copyright (C) 2013  Sylvain Beucler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+   1. The origin of this software must not be misrepresented; you must not
+   claim that you wrote the original software. If you use this software
+   in a product, an acknowledgment in the product documentation would be
+   appreciated but is not required.
+
+   2. Altered source versions must be plainly marked as such, and must not be
+   misrepresented as being the original software.
+
+   3. This notice may not be removed or altered from any source
+   distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+/*
+
+Note: Uses inline x86 MMX or ASM optimizations if available and enabled.
+
+Note: Most of the MMX code is based on published routines 
+by Vladimir Kravtchenko at vk@cs.ubc.ca - credits go to 
+him for his work.
+
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Use GCC intrinsics if available: they support both i386 and x86_64,
+   provide ASM-grade performances, and lift the PUSHA/POPA issues. */
+#ifdef __GNUC__
+#  ifdef USE_MMX
+#    include <mmintrin.h>
+#  endif
+#endif
+#include <SDL_cpuinfo.h>
+#include "SDL_imageFilter.h"
+
+/*!
+\brief Swaps the byte order in a 32bit integer (LSB becomes MSB, etc.). The argument is evaluated four times.
+*/
+#define SWAP_32(x) (((x) >> 24) | (((x) & 0x00ff0000) >> 8)  | (((x) & 0x0000ff00) << 8)  | ((x) << 24))
+
+/* ------ Static variables ----- */
+
+/*! 
+\brief Static state which enables the use of the MMX routines. Enabled by default; changed by SDL_imageFilterMMXoff() and SDL_imageFilterMMXon().
+*/
+static int SDL_imageFilterUseMMX = 1;
+
+/* Detect GCC: GCC__ selects the intrinsics code paths instead of the inline __asm blocks */
+#if defined(__GNUC__)
+#define GCC__
+#endif
+
+/*!
+\brief Report whether the MMX code paths may be used (honors the override flag).
+
+\returns 0 when MMX use has been switched off, otherwise the result of SDL_HasMMX().
+*/
+int SDL_imageFilterMMXdetect(void)
+{
+	/* The CPU is only queried while the override flag permits MMX */
+	if (SDL_imageFilterUseMMX != 0) {
+		return SDL_HasMMX();
+	}
+
+	return (0);
+}
+
+/*!
+\brief Disable the MMX check for filter functions and force the use of the non-MMX C based code.
+*/
+void SDL_imageFilterMMXoff()
+{
+	SDL_imageFilterUseMMX = 0;	/* makes SDL_imageFilterMMXdetect() return 0 */
+}
+
+/*!
+\brief Enable MMX check for filter functions and use MMX code if available. This is the default state.
+*/
+void SDL_imageFilterMMXon()
+{
+	SDL_imageFilterUseMMX = 1;	/* SDL_imageFilterMMXdetect() consults SDL_HasMMX() again */
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Internal MMX Filter using Add: D = saturation255(S1 + S2). Processes SrcLength/8 groups of 8 bytes; remaining bytes are not touched.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (always -1, without writing anything, when built without USE_MMX).
+*/
+static int SDL_imageFilterAddMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1	/* load Src1 address into eax */
+			mov ebx, Src2	/* load Src2 address into ebx */
+			mov edi, Dest	/* load Dest address into edi */
+			mov ecx, SrcLength	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16	/* 16 byte alignment of the loop entry */
+L1010:
+		movq mm1, [eax]	/* load 8 bytes from Src1 into mm1 */
+		paddusb mm1, [ebx]	/* mm1=Src1+Src2 (add 8 bytes with saturation) */
+		movq [edi], mm1	/* store result in Dest */
+			add eax, 8	/* increase Src1, Src2 and Dest  */
+			add ebx, 8	/* register pointers by 8 */
+			add edi, 8
+			dec ecx	/* decrease loop counter */
+			jnz L1010	/* check loop termination, proceed if required */
+			emms /* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: same loop written with GCC MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_paddusb(*mSrc1, *mSrc2);	/* Src1+Src2 (add 8 bytes with saturation) */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using Add: D = saturation255(S1 + S2) 
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (a NULL pointer was passed).
+*/
+int SDL_imageFilterAdd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* By default the C routine below processes the whole image */
+	istart = 0;
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+		/*
+		* Use MMX routine for the leading multiple of 8 bytes. It reports -1 without
+		* touching Dest when MMX support was not compiled in; in that case the
+		* C routine must still process everything.
+		*/
+		if (SDL_imageFilterAddMMX(Src1, Src2, Dest, length) == 0) {
+			/* No unaligned bytes - we are done */
+			if ((length & 7) == 0) {
+				return (0);
+			}
+			/* Only the trailing unaligned bytes are left for the C routine */
+			istart = length & 0xfffffff8;
+		}
+	}
+
+	/* Setup pointers at the first byte the C routine has to process */
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 + (int) *cursrc2;
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using Mean: D = S1/2 + S2/2. Processes SrcLength/8 groups of 8 bytes; remaining bytes are not touched.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+\param Mask Mask array containing 8 bytes with 0x7F value.
+
+\return Returns 0 for success or -1 for error (always -1, without writing anything, when built without USE_MMX).
+*/
+static int SDL_imageFilterMeanMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength,
+						   unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{ 
+		pusha
+			mov edx, Mask /* load Mask address into edx */
+			movq mm0, [edx] /* load Mask into mm0 */
+		mov eax, Src1 /* load Src1 address into eax */
+			mov ebx, Src2 /* load Src2 address into ebx */
+			mov edi, Dest /* load Dest address into edi */
+			mov ecx, SrcLength /* load loop counter (SIZE) into ecx */
+			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16	/* 16 byte alignment of the loop entry */
+L21011:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
+		/* --- Byte shift via Word shift --- */
+		psrlw mm1, 1 	/* shift 4 WORDS of mm1 1 bit to the right */
+			psrlw mm2, 1 	/* shift 4 WORDS of mm2 1 bit to the right */
+			pand mm1, mm0   // apply Mask to 8 BYTES of mm1 */
+			/* byte     0x0f, 0xdb, 0xc8 */
+			pand mm2, mm0   // apply Mask to 8 BYTES of mm2 */
+			/* byte     0x0f, 0xdb, 0xd0 */
+			paddusb mm1,  mm2 	/* mm1=mm1+mm2 (add 8 bytes with saturation) */
+			movq [edi],  mm1 	/* store result in Dest */
+			add eax,  8 	/* increase Src1, Src2 and Dest  */
+			add ebx,  8 	/* register pointers by 8 */
+			add edi,  8
+			dec ecx 	/* decrease loop counter */
+			jnz L21011	/* check loop termination, proceed if required */
+			emms	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: same loop written with GCC MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1 = *mSrc1,
+		      mm2 = *mSrc2;
+		mm1 = _m_psrlwi(mm1, 1);	/* shift 4 WORDS of mm1 1 bit to the right */
+		mm2 = _m_psrlwi(mm2, 1);	/* shift 4 WORDS of mm2 1 bit to the right */
+		mm1 = _m_pand(mm1, *mMask);	/* apply Mask to 8 BYTES of mm1: clears the bit shifted in from the neighbouring byte */
+		mm2 = _m_pand(mm2, *mMask);	/* apply Mask to 8 BYTES of mm2 */
+		*mDest = _m_paddusb(mm1, mm2);	/* mm1+mm2 (add 8 bytes with saturation) */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using Mean: D = S1/2 + S2/2
+
+Whole 8-byte groups are handled by the MMX routine when MMX is detected and that routine
+reports success; the remaining bytes, or the whole array otherwise, use the C loop.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (a NULL array pointer).
+*/
+int SDL_imageFilterMean(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/*
+	 * Use the MMX result only if the routine reports success: it returns -1
+	 * without writing Dest when built without USE_MMX, in which case the whole
+	 * array must go through the C loop instead of being left unprocessed.
+	 */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterMeanMMX(Src1, Src2, Dest, length, Mask) == 0)) {
+		/* No unaligned bytes - we are done */
+		if ((length & 7) == 0)
+			return (0);
+		/* Setup to process unaligned bytes */
+		istart = length & ~7u;
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+	}
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 / 2 + (int) *cursrc2 / 2;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using Sub: D = saturation0(S1 - S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only the SrcLength/8 whole 8-byte groups are processed.
+
+\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
+*/
+static int SDL_imageFilterSubMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax,  Src1 	/* load Src1 address into eax */
+			mov ebx,  Src2 	/* load Src2 address into ebx */
+			mov edi,  Dest 	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); must be non-zero, the loop tests after decrementing */
+			align 16 /* 16 byte alignment of the loop entry */
+L1012:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
+		movq [edi],  mm1 	/* store result in Dest */
+			add eax, 8 	/* increase Src1, Src2 and Dest  */
+			add ebx, 8 	/* register pointers by 8 */
+			add edi, 8
+			dec ecx	/* decrease loop counter */
+			jnz L1012	/* check loop termination, proceed if required */
+			emms /* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 (GCC__ defined): same steps written with MMX intrinsics, 8 bytes per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_psubusb(*mSrc1, *mSrc2);	/* Src1-Src2 (sub 8 bytes with saturation) */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was processed */
+#endif
+}
+
+/*!
+\brief Filter using Sub: D = saturation0(S1 - S2)
+
+Whole 8-byte groups are handled by the MMX routine when MMX is detected and that routine
+reports success; the remaining bytes, or the whole array otherwise, use the C loop.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (a NULL array pointer).
+*/
+int SDL_imageFilterSub(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/*
+	 * Use the MMX result only if the routine reports success: it returns -1
+	 * without writing Dest when built without USE_MMX, in which case the whole
+	 * array must go through the C loop instead of being left unprocessed.
+	 */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterSubMMX(Src1, Src2, Dest, length) == 0)) {
+		/* No unaligned bytes - we are done */
+		if ((length & 7) == 0)
+			return (0);
+		/* Setup to process unaligned bytes */
+		istart = length & ~7u;
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+	}
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 - (int) *cursrc2;
+		if (result < 0)
+			result = 0;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AbsDiff: D = | S1 - S2 |
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only the SrcLength/8 whole 8-byte groups are processed.
+
+\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
+*/
+static int SDL_imageFilterAbsDiffMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1  	/* load Src1 address into eax */
+			mov ebx, Src2 	/* load Src2 address into ebx */
+			mov edi, Dest 	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); must be non-zero, the loop tests after decrementing */
+			align 16	/* 16 byte alignment of the loop entry */
+L1013:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		movq mm2,  [ebx] 	/* load 8 bytes from Src2 into mm2 */
+		psubusb mm1,  [ebx] 	/* mm1=Src1-Src2 (sub 8 bytes with saturation) */
+		psubusb mm2,  [eax] 	/* mm2=Src2-Src1 (sub 8 bytes with saturation) */
+		por mm1,  mm2 	/* per byte one of the two saturated differences is 0, so OR-ing them gives |Src1-Src2| */
+			movq [edi],  mm1 	/* store result in Dest */
+			add eax, 8 	/* increase Src1, Src2 and Dest  */
+			add ebx, 8 	/* register pointers by 8 */
+			add edi, 8
+			dec ecx 	/* decrease loop counter */
+			jnz L1013    	/* check loop termination, proceed if required */
+			emms         /* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 (GCC__ defined): same steps written with MMX intrinsics, 8 bytes per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1 = _m_psubusb(*mSrc2, *mSrc1);	/* Src2-Src1 (sub 8 bytes with saturation) */
+		__m64 mm2 = _m_psubusb(*mSrc1, *mSrc2);	/* Src1-Src2 (sub 8 bytes with saturation) */
+		*mDest = _m_por(mm1, mm2);		/* per byte one difference is 0, so the OR is |Src1-Src2| */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was processed */
+#endif
+}
+
+/*!
+\brief Filter using AbsDiff: D = | S1 - S2 |
+
+Whole 8-byte groups are handled by the MMX routine when MMX is detected and that routine
+reports success; the remaining bytes, or the whole array otherwise, use the C loop.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (a NULL array pointer).
+*/
+int SDL_imageFilterAbsDiff(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/*
+	 * Use the MMX result only if the routine reports success: it returns -1
+	 * without writing Dest when built without USE_MMX, in which case the whole
+	 * array must go through the C loop instead of being left unprocessed.
+	 */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterAbsDiffMMX(Src1, Src2, Dest, length) == 0)) {
+		/* No unaligned bytes - we are done */
+		if ((length & 7) == 0)
+			return (0);
+		/* Setup to process unaligned bytes */
+		istart = length & ~7u;
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+	}
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = abs((int) *cursrc1 - (int) *cursrc2);
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using Mult: D = saturation255(S1 * S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only the SrcLength/8 whole 8-byte groups are processed.
+
+\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
+*/
+static int SDL_imageFilterMultMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   /* load Src1 address into eax */
+			mov ebx, Src2   /* load Src2 address into ebx */
+			mov edi, Dest   /* load Dest address into edi */
+			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx */
+			shr ecx, 3   /* counter/8 (MMX loads 8 bytes at a time); must be non-zero, the loop tests after decrementing */
+			pxor mm0, mm0   /* zero mm0 register */
+			align 16      	/* 16 byte alignment of the loop entry */
+L1014:
+		movq mm1, [eax]   /* load 8 bytes from Src1 into mm1 */
+		movq mm3, [ebx]   /* load 8 bytes from Src2 into mm3 */
+		movq mm2, mm1   /* copy mm1 into mm2 */
+			movq mm4, mm3   /* copy mm3 into mm4  */
+			punpcklbw mm1, mm0   /* unpack low  bytes of Src1 into words */
+			punpckhbw mm2, mm0   /* unpack high bytes of Src1 into words */
+			punpcklbw mm3, mm0   /* unpack low  bytes of Src2 into words */
+			punpckhbw mm4, mm0   /* unpack high bytes of Src2 into words */
+			pmullw mm1, mm3   /* mul low  bytes of Src1 and Src2  */
+			pmullw mm2, mm4   /* mul high bytes of Src1 and Src2 */
+			/* Take abs value of the results (signed words): products >= 0x8000 look negative and must still saturate to 255 */
+			movq mm5, mm1   /* copy mm1 into mm5 */
+			movq mm6, mm2   /* copy mm2 into mm6 */
+			psraw mm5, 15   /* fill mm5 words with word sign bit */
+			psraw mm6, 15   /* fill mm6 words with word sign bit */
+			pxor mm1, mm5   /* take 1's complement of only neg. words */
+			pxor mm2, mm6   /* take 1's complement of only neg. words */
+			psubsw mm1, mm5   /* add 1 to only neg. words, W-(-1) or W-0 */
+			psubsw mm2, mm6   /* add 1 to only neg. words, W-(-1) or W-0 */
+			packuswb mm1, mm2   /* pack words back into bytes with saturation */
+			movq [edi], mm1   /* store result in Dest */
+			add eax, 8   /* increase Src1, Src2 and Dest  */
+			add ebx, 8   /* register pointers by 8 */
+			add edi, 8
+			dec ecx 	/* decrease loop counter */
+			jnz L1014	/* check loop termination, proceed if required */
+			emms /* exit MMX state */
+			popa
+	}
+#else
+	/* i386 ASM with constraints (disabled, kept for reference): */
+	/* asm volatile ( */
+	/* 	"shr $3, %%ecx \n\t"	/\* counter/8 (MMX loads 8 bytes at a time) *\/ */
+	/* 	"pxor      %%mm0, %%mm0 \n\t"	/\* zero mm0 register *\/ */
+	/* 	".align 16       \n\t"	/\* 16 byte alignment of the loop entry *\/ */
+	/* 	"1: movq (%%eax), %%mm1 \n\t"     /\* load 8 bytes from Src1 into mm1 *\/ */
+	/* 	"movq    (%%ebx), %%mm3 \n\t"	/\* load 8 bytes from Src2 into mm3 *\/ */
+	/* 	"movq      %%mm1, %%mm2 \n\t"	/\* copy mm1 into mm2 *\/ */
+	/* 	"movq      %%mm3, %%mm4 \n\t"	/\* copy mm3 into mm4  *\/ */
+	/* 	"punpcklbw %%mm0, %%mm1 \n\t"	/\* unpack low  bytes of Src1 into words *\/ */
+	/* 	"punpckhbw %%mm0, %%mm2 \n\t"	/\* unpack high bytes of Src1 into words *\/ */
+	/* 	"punpcklbw %%mm0, %%mm3 \n\t"	/\* unpack low  bytes of Src2 into words *\/ */
+	/* 	"punpckhbw %%mm0, %%mm4 \n\t"	/\* unpack high bytes of Src2 into words *\/ */
+	/* 	"pmullw    %%mm3, %%mm1 \n\t"	/\* mul low  bytes of Src1 and Src2  *\/ */
+	/* 	"pmullw    %%mm4, %%mm2 \n\t"	/\* mul high bytes of Src1 and Src2 *\/ */
+	/* 	/\* Take abs value of the results (signed words) *\/ */
+	/* 	"movq      %%mm1, %%mm5 \n\t"	/\* copy mm1 into mm5 *\/ */
+	/* 	"movq      %%mm2, %%mm6 \n\t"	/\* copy mm2 into mm6 *\/ */
+	/* 	"psraw       $15, %%mm5 \n\t"	/\* fill mm5 words with word sign bit *\/ */
+	/* 	"psraw       $15, %%mm6 \n\t"	/\* fill mm6 words with word sign bit *\/ */
+	/* 	"pxor      %%mm5, %%mm1 \n\t"	/\* take 1's compliment of only neg. words *\/ */
+	/* 	"pxor      %%mm6, %%mm2 \n\t"	/\* take 1's compliment of only neg. words *\/ */
+	/* 	"psubsw    %%mm5, %%mm1 \n\t"	/\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
+	/* 	"psubsw    %%mm6, %%mm2 \n\t"	/\* add 1 to only neg. words, W-(-1) or W-0 *\/ */
+	/* 	"packuswb  %%mm2, %%mm1 \n\t"	/\* pack words back into bytes with saturation *\/ */
+	/* 	"movq    %%mm1, (%%edi) \n\t"	/\* store result in Dest *\/ */
+	/* 	"add $8, %%eax \n\t"	/\* increase Src1, Src2 and Dest  *\/ */
+	/* 	"add $8, %%ebx \n\t"	/\* register pointers by 8 *\/ */
+	/* 	"add $8, %%edi \n\t" */
+	/* 	"dec %%ecx     \n\t"	/\* decrease loop counter *\/ */
+	/* 	"jnz 1b        \n\t"	/\* check loop termination, proceed if required *\/ */
+	/* 	"emms          \n\t"	/\* exit MMX state *\/ */
+	/* 	: "+a" (Src1),		/\* load Src1 address into rax, modified by the loop *\/ */
+	/* 	  "+b" (Src2),		/\* load Src2 address into rbx, modified by the loop *\/ */
+	/* 	  "+c" (SrcLength),	/\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
+	/* 	  "+D" (Dest)		/\* load Dest address into rdi, modified by the loop *\/ */
+	/* 	: */
+	/* 	: "memory",		/\* *Dest is modified *\/ */
+        /*           "mm0","mm1","mm2","mm3","mm4","mm5","mm6"	/\* registers modified *\/ */
+	/* ); */
+
+	/* i386 and x86_64 (GCC__ defined): same steps written with MMX intrinsics, 8 bytes per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1, mm2, mm3, mm4, mm5, mm6;
+		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
+		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
+		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
+		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
+		mm5 = _m_psrawi(mm1, 15);		/* fill mm5 words with word sign bit */
+		mm6 = _m_psrawi(mm2, 15);		/* fill mm6 words with word sign bit */
+		mm1 = _m_pxor(mm1, mm5);		/* take 1's complement of only neg. words */
+		mm2 = _m_pxor(mm2, mm6);		/* take 1's complement of only neg. words */
+		mm1 = _m_psubsw(mm1, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		mm2 = _m_psubsw(mm2, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was processed */
+#endif
+}
+
+/*!
+\brief Filter using Mult: D = saturation255(S1 * S2)
+
+Whole 8-byte groups are handled by the MMX routine when MMX is detected and that routine
+reports success; the remaining bytes, or the whole array otherwise, use the C loop.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (a NULL array pointer).
+*/
+int SDL_imageFilterMult(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/*
+	 * Use the MMX result only if the routine reports success: it returns -1
+	 * without writing Dest when built without USE_MMX, in which case the whole
+	 * array must go through the C loop instead of being left unprocessed.
+	 */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterMultMMX(Src1, Src2, Dest, length) == 0)) {
+		/* No unaligned bytes - we are done */
+		if ((length & 7) == 0)
+			return (0);
+		/* Setup to process unaligned bytes */
+		istart = length & ~7u;
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+	}
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		/*
+		 * Same result as the MMX routine: the byte product clamped to 255.
+		 */
+		result = (int) *cursrc1 * (int) *cursrc2;
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal ASM Filter using MultNor: D = S1 * S2 (only the low 8 bits of each product are stored)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; every byte is processed, and the count must be non-zero.
+
+\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
+*/
+int SDL_imageFilterMultNorASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov edx, Src1   /* load Src1 address into edx */
+			mov esi, Src2   /* load Src2 address into esi */
+			mov edi, Dest   /* load Dest address into edi */
+			mov ecx, SrcLength   /* load loop counter (SIZE) into ecx; one iteration per byte, must be non-zero */
+			align 16 	/* 16 byte alignment of the loop entry */
+L10141:
+		mov al, [edx]   /* load a byte from Src1 */
+		mul [esi] 	/* mul with a byte from Src2 */
+		mov [edi], al   /* move a byte result to Dest (low byte of the product) */
+			inc edx 	/* increment Src1, Src2, Dest */
+			inc esi 		/* pointer registers by one */
+			inc edi
+			dec ecx	/* decrease loop counter */
+			jnz L10141  	/* check loop termination, proceed if required */
+			popa
+	}
+#else
+	/* Note: ~5% gain on i386, less efficient than C on x86_64 */
+	/* Also depends on whether this function is static (?!) */
+	asm volatile (
+		".align 16       \n\t"	/* 16 byte alignment of the loop entry */
+#  if defined(i386)	/* NOTE(review): if neither i386 nor __x86_64__ is defined the loop body and label 1 are omitted - confirm the build defines one */
+		"1:mov  (%%edx), %%al \n\t"      /* load a byte from Src1 */
+		"mulb (%%esi)       \n\t"	/* mul with a byte from Src2 */
+		"mov %%al, (%%edi)  \n\t"       /* move a byte result to Dest */
+		"inc %%edx \n\t"		/* increment Src1, Src2, Dest */
+		"inc %%esi \n\t"		/* pointer registers by one */
+		"inc %%edi \n\t"
+		"dec %%ecx      \n\t"	/* decrease loop counter */
+#  elif defined(__x86_64__)
+		"1:mov  (%%rdx), %%al \n\t"      /* load a byte from Src1 */
+		"mulb (%%rsi)       \n\t"	/* mul with a byte from Src2 */
+		"mov %%al, (%%rdi)  \n\t"       /* move a byte result to Dest */
+		"inc %%rdx \n\t"		/* increment Src1, Src2, Dest */
+		"inc %%rsi \n\t"		/* pointer registers by one */
+		"inc %%rdi \n\t"
+		"dec %%rcx      \n\t"	/* decrease loop counter; NOTE(review): 64-bit decrement of a 32-bit operand, presumably relies on the upper half of rcx being zero - confirm */
+#  endif
+		"jnz 1b         \n\t"	/* check loop termination, proceed if required */
+		: "+d" (Src1),		/* load Src1 address into edx */
+		  "+S" (Src2),		/* load Src2 address into esi */
+		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
+		  "+D" (Dest)		/* load Dest address into edi */
+		:
+		: "memory", "rax"	/* *Dest is written; mulb overwrites ax */
+		);
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was processed */
+#endif
+}
+
+/*!
+\brief Filter using MultNor: D = S1 * S2
+
+Each destination byte receives the low 8 bits of the product (no saturation).
+
+When MMX is detected the ASM routine is tried first. It multiplies all length bytes
+itself, one byte per iteration, so after a successful run no tail is left for the C
+loop; processing a tail again would multiply those bytes twice when Dest aliases a
+source array. If the ASM routine reports failure the C loop handles the whole array.
+
+A length of 0 returns 0 without touching any array. Dest may be the same array as
+Src1 or Src2 (in-place use).
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (a NULL array pointer).
+*/
+int SDL_imageFilterMultNor(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (SDL_imageFilterMMXdetect()) {
+		/*
+		 * ASM routine: its loop counter is length (not length/8), and it
+		 * returns -1 without writing Dest when built without USE_MMX.
+		 */
+		if (SDL_imageFilterMultNorASM(Src1, Src2, Dest, length) == 0) {
+			/* Every byte has been written - we are done */
+			return (0);
+		}
+		/* ASM routine did nothing - fall through to the C routine */
+	}
+
+	/* Setup to process whole image */
+	cursrc1 = Src1;
+	cursrc2 = Src2;
+	curdst = Dest;
+
+	/* C routine to process image */
+	for (i = 0; i < length; i++) {
+		/* Conversion to unsigned char keeps the low 8 bits of the product */
+		*curdst = (unsigned char) ((int) *cursrc1 * (int) *cursrc2);
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using MultDivby2: D = saturation255(S1/2 * S2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only the SrcLength/8 whole 8-byte groups are processed.
+
+\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
+*/
+static int SDL_imageFilterMultDivby2MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{ 
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); must be non-zero, the loop tests after decrementing */
+			pxor mm0,  mm0 	/* zero mm0 register */
+			align 16          	/* 16 byte alignment of the loop entry */
+L1015:
+		movq mm1,  [eax] 	/* load 8 bytes from Src1 into mm1 */
+		movq mm3,  [ebx] 	/* load 8 bytes from Src2 into mm3 */
+		movq mm2,  mm1 	/* copy mm1 into mm2 */
+			movq mm4,  mm3 	/* copy mm3 into mm4  */
+			punpcklbw mm1,  mm0 	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm2,  mm0 	/* unpack high bytes of Src1 into words */
+			punpcklbw mm3,  mm0 	/* unpack low  bytes of Src2 into words */
+			punpckhbw mm4,  mm0 	/* unpack high bytes of Src2 into words */
+			psrlw mm1,  1 	/* divide mm1 words by 2, Src1 low bytes */
+			psrlw mm2,  1 	/* divide mm2 words by 2, Src1 high bytes */
+			pmullw mm1,  mm3 	/* mul low  bytes of Src1 and Src2  */
+			pmullw mm2,  mm4 	/* mul high bytes of Src1 and Src2 */
+			packuswb mm1,  mm2 	/* pack words back into bytes with saturation */
+			movq [edi],  mm1 	/* store result in Dest */
+			add eax,  8 	/* increase Src1, Src2 and Dest  */
+			add ebx,  8 	/* register pointers by 8 */
+			add edi,  8
+			dec ecx        	/* decrease loop counter */
+			jnz L1015       	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 (GCC__ defined): same steps written with MMX intrinsics, 8 bytes per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1, mm2, mm3, mm4, mm5, mm6;	/* NOTE(review): mm5 and mm6 are declared but never used in this loop */
+		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
+		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
+		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
+		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
+		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
+		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
+		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was processed */
+#endif
+}
+
+/*!
+\brief Filter using MultDivby2: D = saturation255(S1/2 * S2)
+
+Whole 8-byte groups are handled by the MMX routine when MMX is detected and that routine
+reports success; the remaining bytes, or the whole array otherwise, use the C loop.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (a NULL array pointer).
+*/
+int SDL_imageFilterMultDivby2(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/*
+	 * Use the MMX result only if the routine reports success: it returns -1
+	 * without writing Dest when built without USE_MMX, in which case the whole
+	 * array must go through the C loop instead of being left unprocessed.
+	 */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterMultDivby2MMX(Src1, Src2, Dest, length) == 0)) {
+		/* No unaligned bytes - we are done */
+		if ((length & 7) == 0)
+			return (0);
+		/* Setup to process unaligned bytes */
+		istart = length & ~7u;
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+	}
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = ((int) *cursrc1 / 2) * (int) *cursrc2;
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only the SrcLength/8 whole 8-byte groups are processed.
+
+\return Returns 0 for success or -1 for error (built without USE_MMX; nothing is processed).
+*/
+static int SDL_imageFilterMultDivby4MMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); must be non-zero, the loop tests after decrementing */
+			pxor mm0, mm0   	/* zero mm0 register */
+			align 16          	/* 16 byte alignment of the loop entry */
+L1016:
+		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
+		movq mm3, [ebx]   	/* load 8 bytes from Src2 into mm3 */
+		movq mm2, mm1   	/* copy mm1 into mm2 */
+			movq mm4, mm3   	/* copy mm3 into mm4  */
+			punpcklbw mm1, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm2, mm0   	/* unpack high bytes of Src1 into words */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src2 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src2 into words */
+			psrlw mm1, 1   	/* divide mm1 words by 2, Src1 low bytes */
+			psrlw mm2, 1   	/* divide mm2 words by 2, Src1 high bytes */
+			psrlw mm3, 1   	/* divide mm3 words by 2, Src2 low bytes */
+			psrlw mm4, 1   	/* divide mm4 words by 2, Src2 high bytes */
+			pmullw mm1, mm3   	/* mul low  bytes of Src1 and Src2  */
+			pmullw mm2, mm4   	/* mul high bytes of Src1 and Src2 */
+			packuswb mm1, mm2   	/* pack words back into bytes with saturation */
+			movq [edi], mm1   	/* store result in Dest */
+			add eax, 8   	/* increase Src1, Src2 and Dest  */
+			add ebx, 8   	/* register pointers by 8 */
+			add edi,  8
+			dec ecx        	/* decrease loop counter */
+			jnz L1016       	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 (GCC__ defined): same steps written with MMX intrinsics, 8 bytes per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0); /* zero mm0 register */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm1, mm2, mm3, mm4, mm5, mm6;	/* NOTE(review): mm5 and mm6 are declared but never used in this loop */
+		mm1 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm2 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_punpcklbw(*mSrc2, mm0);	/* unpack low  bytes of Src2 into words */
+		mm4 = _m_punpckhbw(*mSrc2, mm0);	/* unpack high bytes of Src2 into words */
+		mm1 = _m_psrlwi(mm1, 1);		/* divide mm1 words by 2, Src1 low bytes */
+		mm2 = _m_psrlwi(mm2, 1);		/* divide mm2 words by 2, Src1 high bytes */
+		mm3 = _m_psrlwi(mm3, 1);		/* divide mm3 words by 2, Src2 low bytes */
+		mm4 = _m_psrlwi(mm4, 1);		/* divide mm4 words by 2, Src2 high bytes */
+		mm1 = _m_pmullw(mm1, mm3);		/* mul low  bytes of Src1 and Src2  */
+		mm2 = _m_pmullw(mm2, mm4);		/* mul high bytes of Src1 and Src2 */
+		*mDest = _m_packuswb(mm1, mm2);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);	/* built without USE_MMX: nothing was processed */
+#endif
+}
+
+/*!
+\brief Filter using MultDivby4: D = saturation255(S1/2 * S2/2)
+
+Whole 8-byte groups are handled by the MMX routine when MMX is detected and that routine
+reports success; the remaining bytes, or the whole array otherwise, use the C loop.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success or -1 for error (a NULL array pointer).
+*/
+int SDL_imageFilterMultDivby4(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/*
+	 * Use the MMX result only if the routine reports success: it returns -1
+	 * without writing Dest when built without USE_MMX, in which case the whole
+	 * array must go through the C loop instead of being left unprocessed.
+	 */
+	if ((SDL_imageFilterMMXdetect()) && (length > 7) &&
+	    (SDL_imageFilterMultDivby4MMX(Src1, Src2, Dest, length) == 0)) {
+		/* No unaligned bytes - we are done */
+		if ((length & 7) == 0)
+			return (0);
+		/* Setup to process unaligned bytes */
+		istart = length & ~7u;
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+	}
+	cursrc1 = &Src1[istart];
+	cursrc2 = &Src2[istart];
+	curdst = &Dest[istart];
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		result = ((int) *cursrc1 / 2) * ((int) *cursrc2 / 2);
+		if (result > 255)
+			result = 255;
+		*curdst = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using BitAnd: D = S1 & S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only the SrcLength/8 complete 8-byte groups are processed.
+
+\return Returns 0 when built with USE_MMX, otherwise -1 without touching Dest.
+*/
+static int SDL_imageFilterBitAndMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3 	/* counter/8 (MMX loads 8 bytes at a time); the loop below always runs at least once */
+			align 16          	/* 16 byte alignment of the loop entry */
+L1017:
+		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
+		pand mm1, [ebx]   	/* mm1=Src1&Src2 */
+		movq [edi], mm1   	/* store result in Dest */
+			add eax, 8   	/* increase Src1, Src2 and Dest  */
+			add ebx, 8   	/* register pointers by 8 */
+			add edi, 8
+			dec ecx        	/* decrease loop counter */
+			jnz L1017       	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else
+	/* Disabled alternative, kept for reference: x86_64 ASM with constraints: */
+	/* asm volatile ( */
+	/* 	"shr $3, %%rcx \n\t"	/\* counter/8 (MMX loads 8 bytes at a time) *\/ */
+	/* 	".align 16       \n\t"	/\* 16 byte alignment of the loop entry *\/ */
+	/* 	"1: movq (%%rax), %%mm1 \n\t"	/\* load 8 bytes from Src1 into mm1 *\/ */
+	/* 	"pand    (%%rbx), %%mm1 \n\t"	/\* mm1=Src1&Src2 *\/ */
+	/* 	"movq    %%mm1, (%%rdi) \n\t"	/\* store result in Dest *\/ */
+	/* 	"add $8, %%rax \n\t"	/\* increase Src1, Src2 and Dest  *\/ */
+	/* 	"add $8, %%rbx \n\t"	/\* register pointers by 8 *\/ */
+	/* 	"add $8, %%rdi \n\t" */
+	/* 	"dec %%rcx     \n\t"	/\* decrease loop counter *\/ */
+	/* 	"jnz 1b        \n\t"	/\* check loop termination, proceed if required *\/ */
+	/* 	"emms          \n\t"	/\* exit MMX state *\/ */
+	/* 	: "+a" (Src1),		/\* load Src1 address into rax, modified by the loop *\/ */
+	/* 	  "+b" (Src2),		/\* load Src2 address into rbx, modified by the loop *\/ */
+	/* 	  "+c" (SrcLength),	/\* load loop counter (SIZE) into rcx, modified by the loop *\/ */
+	/* 	  "+D" (Dest)		/\* load Dest address into rdi, modified by the loop *\/ */
+	/* 	: */
+	/* 	: "memory",		/\* *Dest is modified *\/ */
+        /*           "mm1"			/\* register mm1 modified *\/ */
+	/* ); */
+
+	/* i386 and x86_64: intrinsics version, one 8-byte group per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_pand(*mSrc1, *mSrc2);	/* Src1&Src2 */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using BitAnd: D = S1 & S2
+
+Every destination byte is the bitwise AND of the two source bytes at the
+same index. If SDL_imageFilterMMXdetect() returns a positive value and at
+least 8 bytes are given, SDL_imageFilterBitAndMMX() is called on the
+buffers first (it handles the complete 8-byte groups) and the scalar loop
+only computes the trailing (length & 7) bytes; otherwise the scalar loop
+computes every byte.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success (including length == 0) or -1 when any of the
+three pointers is NULL. The return value of the MMX helper is not
+inspected.
+*/
+int SDL_imageFilterBitAnd(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int idx;	/* byte index used by the scalar loop */
+	unsigned int tail;	/* first index the scalar loop has to cover */
+
+	/* Reject missing buffers */
+	if (!Src1 || !Src2 || !Dest) {
+		return (-1);
+	}
+
+	/* An empty range is trivially complete */
+	if (!length) {
+		return (0);
+	}
+
+	/* Use the MMX routine when it is available and at least 8 bytes are given */
+	if ((SDL_imageFilterMMXdetect() > 0) && (length >= 8)) {
+		/* Handles the length/8 complete 8-byte groups */
+		SDL_imageFilterBitAndMMX(Src1, Src2, Dest, length);
+
+		/* A length that is a multiple of 8 leaves nothing for the scalar loop */
+		if (!(length & 7)) {
+			return (0);
+		}
+
+		/* Otherwise resume at the first byte past the last complete group */
+		tail = length & 0xfffffff8;
+	} else {
+		/* No MMX, or fewer than 8 bytes: the scalar loop covers everything */
+		tail = 0;
+	}
+
+	/* Scalar pass over the remaining bytes */
+	for (idx = tail; idx < length; idx++) {
+		Dest[idx] = Src1[idx] & Src2[idx];
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using BitOr: D = S1 | S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; only the SrcLength/8 complete 8-byte groups are processed.
+
+\return Returns 0 when built with USE_MMX, otherwise -1 without touching Dest.
+*/
+static int SDL_imageFilterBitOrMMX(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov ebx, Src2   	/* load Src2 address into ebx */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time); the loop below always runs at least once */
+			align 16          	/* 16 byte alignment of the loop entry */
+L91017:
+		movq mm1, [eax]   	/* load 8 bytes from Src1 into mm1 */
+		por mm1, [ebx]   	/* mm1=Src1|Src2 */
+		movq [edi], mm1   	/* store result in Dest */
+			add eax, 8   	/* increase Src1, Src2 and Dest  */
+			add ebx, 8   	/* register pointers by 8 */
+			add edi,  8
+			dec ecx        	/* decrease loop counter */
+			jnz L91017      	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: intrinsics version, one 8-byte group per iteration */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mSrc2 = (__m64*)Src2;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_por(*mSrc1, *mSrc2);	/* Src1|Src2 */
+		mSrc1++;
+		mSrc2++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using BitOr: D = S1 | S2
+
+Every destination byte is the bitwise OR of the two source bytes at the same
+index. SDL_imageFilterBitOrMMX() handles the complete 8-byte groups when
+SDL_imageFilterMMXdetect() reports support and length is at least 8; the
+scalar loop computes the trailing (length & 7) bytes, or every byte otherwise.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success (including length == 0) or -1 when any of the
+three pointers is NULL; the MMX helper's return value is not inspected.
+*/
+int SDL_imageFilterBitOr(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int offset;	/* byte index used by the scalar loop */
+	unsigned int start;	/* first index the scalar loop has to cover */
+
+	/* Reject missing buffers */
+	if (!Src1 || !Src2 || !Dest) {
+		return (-1);
+	}
+
+	/* An empty range is trivially complete */
+	if (!length) {
+		return (0);
+	}
+
+	/* Use the MMX routine when it is available and at least 8 bytes are given */
+	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
+		/* Handles the length/8 complete 8-byte groups */
+		SDL_imageFilterBitOrMMX(Src1, Src2, Dest, length);
+
+		/* A length that is a multiple of 8 leaves nothing for the scalar loop */
+		if (!(length & 7)) {
+			return (0);
+		}
+
+		/* Otherwise resume at the first byte past the last complete group */
+		start = length & 0xfffffff8;
+	} else {
+		/* No MMX, or fewer than 8 bytes: the scalar loop covers everything */
+		start = 0;
+	}
+
+	/* Scalar pass over the remaining bytes */
+	for (offset = start; offset < length; offset++) {
+		Dest[offset] = Src1[offset] | Src2[offset];
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal ASM Filter using Div: D = S1 / S2
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2); a zero byte stores 255 in Dest instead of dividing.
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays; processed one byte at a time, the loop always runs at least once.
+
+\return Returns 0 when built with USE_MMX, otherwise -1 without touching Dest.
+*/
+static int SDL_imageFilterDivASM(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov edx, Src1   	/* load Src1 address into edx */
+			mov esi, Src2   	/* load Src2 address into esi */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			align 16        	/* 16 byte alignment of the loop entry */
+L10191:
+		mov bl, [esi]   	/* load a byte from Src2 */
+		cmp bl, 0   	/* check if it zero */
+			jnz L10192
+			mov [edi], 255   	/* division by zero = 255 !!! */
+			jmp  L10193
+L10192:
+		xor ah, ah   	/* prepare AX, zero AH register */
+			mov al, [edx]   	/* load a byte from Src1 into AL */
+		div   bl             	/* divide AL by BL */
+			mov [edi], al   	/* move a byte result to Dest */
+L10193:
+		inc edx    	/* increment Src1, Src2, Dest */
+			inc esi    		/* pointer registers by one */
+			inc edi
+			dec ecx       	/* decrease loop counter */
+			jnz L10191     	/* check loop termination, proceed if required */
+			popa
+	}
+#else
+	/* Note: ~15% gain on i386, less efficient than C on x86_64 */
+	/* Also depends on whether the function is static (?!) */
+	/* Also depends on whether we work on malloc() or static char[] */
+	asm volatile (
+#  if defined(i386)
+		"pushl %%ebx \n\t"		/* %ebx may be the PIC register.  */
+		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
+		"1: mov (%%esi), %%bl  \n\t"	/* load a byte from Src2 */
+		"cmp       $0, %%bl    \n\t"	/* check if it zero */
+		"jnz 2f                \n\t"
+		"movb  $255, (%%edi)   \n\t"	/* division by zero = 255 !!! */
+		"jmp 3f                \n\t"
+		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
+		"mov   (%%edx), %%al   \n\t"	/* load a byte from Src1 into AL */
+		"div   %%bl            \n\t"	/* divide AL by BL */
+		"mov   %%al, (%%edi)   \n\t"	/* move a byte result to Dest */
+		"3: inc %%edx          \n\t"	/* increment Src1, Src2, Dest */
+		"inc %%esi \n\t"		/* pointer registers by one */
+		"inc %%edi \n\t"
+		"dec %%ecx \n\t"		/* decrease loop counter */
+		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
+		"popl %%ebx \n\t"		/* restore %ebx */
+		: "+d" (Src1),		/* load Src1 address into edx */
+		  "+S" (Src2),		/* load Src2 address into esi */
+		  "+c" (SrcLength),	/* load loop counter (SIZE) into ecx */
+		  "+D" (Dest)		/* load Dest address into edi */
+		:
+		: "memory", "rax"
+#  elif defined(__x86_64__)
+		".align 16     \n\t"		/* 16 byte alignment of the loop entry */
+		"1: mov (%%rsi), %%bl  \n\t"	/* load a byte from Src2 */
+		"cmp       $0, %%bl    \n\t"	/* check if it zero */
+		"jnz 2f                \n\t"
+		"movb  $255, (%%rdi)   \n\t"	/* division by zero = 255 !!! */
+		"jmp 3f                \n\t"
+		"2: xor %%ah, %%ah     \n\t"	/* prepare AX, zero AH register */
+		"mov   (%%rdx), %%al   \n\t"	/* load a byte from Src1 into AL */
+		"div   %%bl            \n\t"	/* divide AL by BL */
+		"mov   %%al, (%%rdi)   \n\t"	/* move a byte result to Dest */
+		"3: inc %%rdx          \n\t"	/* increment Src1, Src2, Dest */
+		"inc %%rsi \n\t"		/* pointer registers by one */
+		"inc %%rdi \n\t"
+		"dec %%rcx \n\t"		/* decrease loop counter; NOTE(review): SrcLength is a 32-bit operand, confirm the upper half of rcx is zero */
+		"jnz 1b    \n\t"		/* check loop termination, proceed if required */
+		: "+d" (Src1),		/* load Src1 address into rdx */
+		  "+S" (Src2),		/* load Src2 address into rsi */
+		  "+c" (SrcLength),	/* load loop counter (SIZE) into rcx */
+		  "+D" (Dest)		/* load Dest address into rdi */
+		:
+		: "memory", "rax", "rbx"
+#  endif
+		);
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using Div: D = S1 / S2
+
+Each destination byte is the integer quotient of the source bytes at the
+same index. A zero divisor does not fault: the result is defined as 255.
+
+Examples (S1, S2 -> D):
+- 200,  3 -> 66
+-   7,  9 -> 0
+-  42,  0 -> 255
+-   0,  0 -> 255
+
+If SDL_imageFilterMMXdetect() reports support, the whole range is handed
+to SDL_imageFilterDivASM(). That routine works one byte at a time, so no
+trailing bytes are left over and this function returns right after the
+call without inspecting the helper's return value. Otherwise the C loop
+computes every byte.
+
+\param Src1 Pointer to the start of the first source byte array (S1).
+\param Src2 Pointer to the start of the second source byte array (S2).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+
+\return Returns 0 for success (including length == 0) or -1 when any of the
+three pointers is NULL.
+*/
+int SDL_imageFilterDiv(unsigned char *Src1, unsigned char *Src2, unsigned char *Dest, unsigned int length)
+{
+	unsigned int i;
+	unsigned char *cursrc1, *cursrc2, *curdst;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Src2 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (SDL_imageFilterMMXdetect()) {
+		/* Call ASM routine; length is non-zero here, as its dec/jnz loop requires */
+		SDL_imageFilterDivASM(Src1, Src2, Dest, length);
+
+		/* Never unaligned bytes - we are done */
+		return (0);
+	}
+
+	/* Setup to process whole image */
+	cursrc1 = Src1;
+	cursrc2 = Src2;
+	curdst = Dest;
+
+	/* C routine to process image */
+	for (i = 0; i < length; i++) {
+		if (*cursrc2 == 0) {
+			/* division by zero = 255 */
+			*curdst = 255;
+		} else {
+			*curdst = (unsigned char) (*cursrc1 / *cursrc2);
+		}
+		/* Advance pointers */
+		cursrc1++;
+		cursrc2++;
+		curdst++;
+	}
+
+	return (0);
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Internal MMX Filter using BitNegation: D = ~S (bitwise complement)
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; only the SrcLength/8 complete 8-byte groups are processed.
+
+\return Returns 0 when built with USE_MMX, otherwise -1 without touching Dest.
+*/
+static int SDL_imageFilterBitNegationMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16          	/* 16 byte alignment of the loop entry */
+L91117:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into mm0 */
+		pxor mm0, mm1   	/* negate mm0 by xoring with mm1 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 and Dest  */
+			add edi,  8
+			dec ecx        	/* decrease loop counter */
+			jnz L91117      	/* check loop termination, proceed if required */
+			emms             	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Compare two defined (zero) values for equality; reading an uninitialized __m64 here would be undefined */
+	__m64 mm1 = _m_pcmpeqb(_mm_setzero_si64(), _mm_setzero_si64());	/* generate all 1's in mm1 */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_pxor(*mSrc1, mm1);	/* negate the 8 source bytes by xoring with mm1 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using BitNegation: D = ~S
+
+Writes the bitwise complement (~S) of every source byte. Complete 8-byte groups
+go to SDL_imageFilterBitNegationMMX() when MMX is detected and length > 7.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+
+\return Returns 0 for success (including length == 0) or -1 when a pointer is NULL.
+*/
+int SDL_imageFilterBitNegation(unsigned char *Src1, unsigned char *Dest, unsigned int length)
+{
+	unsigned int k;		/* byte index used by the scalar loop */
+	unsigned int rest;	/* first index the scalar loop has to cover */
+
+	/* Reject missing buffers */
+	if (!Src1 || !Dest) {
+		return (-1);
+	}
+
+	/* An empty range is trivially complete */
+	if (!length) {
+		return (0);
+	}
+
+	/* Use the MMX routine when it is available and at least 8 bytes are given */
+	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
+		/* Handles the length/8 complete 8-byte groups */
+		SDL_imageFilterBitNegationMMX(Src1, Dest, length);
+
+		/* A length that is a multiple of 8 leaves nothing for the scalar loop */
+		if (!(length & 7)) {
+			return (0);
+		}
+
+		/* Otherwise resume at the first byte past the last complete group */
+		rest = length & 0xfffffff8;
+	} else {
+		/* No MMX, or fewer than 8 bytes: the scalar loop covers everything */
+		rest = 0;
+	}
+
+	/* Scalar pass over the remaining bytes */
+	for (k = rest; k < length; k++) {
+		Dest[k] = ~(Src1[k]);
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AddByte: D = saturation255(S + C) 
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; only the SrcLength/8 complete 8-byte groups are processed.
+\param C Constant value to add (C).
+
+\return Returns 0 when built with USE_MMX, otherwise -1 without touching Dest.
+*/
+static int SDL_imageFilterAddByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 8 bytes of MM1 ** */
+			mov al, C   	/* load C into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1021:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0,  mm1 	/* MM0=Src1+C (add 8 bytes with saturation) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1021    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate C in 8 bytes of MM1 */
+	int i;
+	memset(&i, C, 4);	/* i temporarily holds C repeated in 4 bytes before it is reused as the loop index */
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        /* x86_64-only alternative: __m64 mm1 = _m_from_int64(lli); */
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_paddusb(*mSrc1, mm1);	/* Src1+C (add 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using AddByte: D = saturation255(S + C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param C Constant value to add (C); when 0 the source bytes are copied to Dest unchanged.
+
+\return Returns 0 for success (including length == 0) or -1 when Src1 or Dest
+is NULL. The return value of the MMX helper is not inspected.
+*/
+int SDL_imageFilterAddByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1, *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 changes nothing, so D is a plain copy of S */
+	if (C == 0) {
+		memmove(Dest, Src1, length);	/* destination comes first; memmove tolerates Src1 == Dest */
+		return (0); 
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine: handles the length/8 complete 8-byte groups */
+		SDL_imageFilterAddByteMMX(Src1, Dest, length, C);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 + iC;
+		if (result > 255)
+			result = 255;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AddUint: D = saturation255(S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; only the SrcLength/8 complete 8-byte groups are processed.
+\param C Constant to add (C).
+\param D Byteorder-swapped constant to add (Cs); read by the __asm variant only.
+\note NOTE(review): the intrinsics variant puts C in both halves of MM1 and ignores D - confirm which is intended.
+\return Returns 0 when built with USE_MMX, otherwise -1 without touching Dest.
+*/
+static int SDL_imageFilterAddUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Put (int)C in the low and (int)D in the high 4 bytes of MM1 ** */
+			mov eax, C   	/* load C into EAX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			mov eax, D   	/* load D into EAX */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with D */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L11023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0,  mm1 	/* MM0=Src1+MM1 (add 8 bytes with saturation) */
+			movq [edi],  mm0 	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L11023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate (int)C in 8 bytes of MM1 */
+	__m64 mm1 = _m_from_int(C);
+	__m64 mm2 = _m_from_int(C);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        /* x86_64-only alternative: __m64 mm1 = _m_from_int64(lli); */
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_paddusb(*mSrc1, mm1);	/* Src1+C (add 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using AddUint: D = saturation255(S[i] + Cs[i % 4]), Cs=Swap32((uint)C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param C Constant to add (C); the C loop adds byte (i % 4) of C, least significant byte first, to S[i]. 0 copies S to D.
+
+\return Returns 0 for success (including length == 0) or -1 when Src1 or Dest is NULL.
+*/
+int SDL_imageFilterAddUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
+{
+	unsigned int i, j, istart, D;
+	int iC[4];
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 changes nothing, so D is a plain copy of S */
+	if (C == 0) {
+		memmove(Dest, Src1, length);	/* destination comes first; memmove tolerates Src1 == Dest */
+		return (0); 
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine: handles the length/8 complete 8-byte groups */
+		D=SWAP_32(C);
+		SDL_imageFilterAddUintMMX(Src1, Dest, length, C, D);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process bytes: split C into its four bytes, least significant first */
+	iC[3] = (int) ((C >> 24) & 0xff);
+	iC[2] = (int) ((C >> 16) & 0xff);
+	iC[1] = (int) ((C >>  8) & 0xff);
+	iC[0] = (int) ((C >>  0) & 0xff);
+	for (i = istart; i < length; i += 4) {
+		for (j = 0; j < 4; j++) {
+			if ((i+j)<length) {
+				result = (int) *cursrc1 + iC[j];
+				if (result > 255) result = 255;
+				*curdest = (unsigned char) result;
+				/* Advance pointers */
+				cursrc1++;
+				curdest++;
+			}
+		}
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using AddByteToHalf: D = saturation255(S/2 + C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; only the SrcLength/8 complete 8-byte groups are processed.
+\param C Constant to add (C).
+\param Mask Pointer to 8 mask bytes of value 0x7F; ANDed with the word-shifted data to clear each byte's top bit.
+
+\return Returns 0 when built with USE_MMX, otherwise -1 without touching Dest.
+*/
+static int SDL_imageFilterAddByteToHalfMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C,
+									unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 8 bytes of MM1 ** */
+			mov al, C   	/* load C into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
+			mov edx, Mask   	/* load Mask address into edx */
+			movq mm0, [edx]   	/* load Mask into mm0 */
+		mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1022:
+		movq mm2, [eax]   	/* load 8 bytes from Src1 into MM2 */
+		psrlw mm2, 1   	/* shift 4 WORDS of MM2 1 bit to the right */
+			pand mm2, mm0        /* apply Mask to 8 BYTES of MM2 */
+			paddusb mm2,  mm1 	/* MM2=Src1/2+C (add 8 bytes with saturation) */
+			movq [edi], mm2   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1022    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+	/* Duplicate C in 8 bytes of MM1 */
+	int i;
+	memset(&i, C, 4);	/* i temporarily holds C repeated in 4 bytes before it is reused as the loop index */
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        /* x86_64-only alternative: __m64 mm1 = _m_from_int64(lli); */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm2 = _m_psrlwi(*mSrc1, 1);	/* shift 4 WORDS of MM2 1 bit to the right (shadows the outer mm2) */
+		mm2 = _m_pand(mm2, *mMask);		/* apply Mask to 8 BYTES of MM2 */
+							/* byte     0x0f, 0xdb, 0xd0 */
+		*mDest = _m_paddusb(mm1, mm2);		/* Src1/2+C (add 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using AddByteToHalf: D = saturation255(S/2 + C)
+
+Each source byte is halved with integer division and C is added; sums
+above 255 are clamped to 255. If SDL_imageFilterMMXdetect() reports
+support and at least 8 bytes are given, SDL_imageFilterAddByteToHalfMMX()
+handles the complete 8-byte groups (it is passed an 8-byte 0x7F mask) and
+the scalar loop only computes the trailing (length & 7) bytes; otherwise
+the scalar loop computes every byte.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param C Constant to add (C).
+
+\return Returns 0 for success (including length == 0) or -1 when Src1 or
+Dest is NULL. The return value of the MMX helper is not
+inspected.
+*/
+int SDL_imageFilterAddByteToHalf(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
+	unsigned int pos;	/* byte index used by the scalar loop */
+	unsigned int first;	/* first index the scalar loop has to cover */
+	int sum;
+
+	/* Reject missing buffers */
+	if (!Src1 || !Dest) {
+		return (-1);
+	}
+
+	/* An empty range is trivially complete */
+	if (!length) {
+		return (0);
+	}
+
+	/* Use the MMX routine when it is available and at least 8 bytes are given */
+	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
+		/* Handles the length/8 complete 8-byte groups */
+		SDL_imageFilterAddByteToHalfMMX(Src1, Dest, length, C, Mask);
+
+		/* A length that is a multiple of 8 leaves nothing for the scalar loop */
+		if (!(length & 7)) {
+			return (0);
+		}
+
+		/* Otherwise resume at the first byte past the last complete group */
+		first = length & 0xfffffff8;
+	} else {
+		/* No MMX, or fewer than 8 bytes: the scalar loop covers everything */
+		first = 0;
+	}
+
+	/* Scalar pass: S/2 + C, clamped to 255 */
+	for (pos = first; pos < length; pos++) {
+		sum = (Src1[pos] / 2) + C;
+		Dest[pos] = (unsigned char) ((sum > 255) ? 255 : sum);
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using SubByte: D = saturation0(S - C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array; only the SrcLength/8 complete 8-byte groups are processed.
+\param C Constant to subtract (C).
+\note Unlike the neighbouring internal MMX helpers, this function is not declared static.
+\return Returns 0 when built with USE_MMX, otherwise -1 without touching Dest.
+*/
+int SDL_imageFilterSubByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 8 bytes of MM1 ** */
+			mov al, C   	/* load C into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with C */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psubusb mm0,  mm1 	/* MM0=Src1-C (sub 8 bytes with saturation) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate C in 8 bytes of MM1 */
+	int i;
+	memset(&i, C, 4);	/* i temporarily holds C repeated in 4 bytes before it is reused as the loop index */
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        /* x86_64-only alternative: __m64 mm1 = _m_from_int64(lli); */
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_psubusb(*mSrc1, mm1);	/* Src1-C (sub 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using SubByte: D = saturation0(S - C)
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+\param C Constant to subtract (C); when 0 the source bytes are copied to Dest unchanged.
+
+\return Returns 0 for success (including length == 0) or -1 when Src1 or Dest is NULL.
+*/
+int SDL_imageFilterSubByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 changes nothing, so D is a plain copy of S */
+	if (C == 0) {
+		memmove(Dest, Src1, length);	/* destination comes first; memmove tolerates Src1 == Dest */
+		return (0); 
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine: handles the length/8 complete 8-byte groups */
+		SDL_imageFilterSubByteMMX(Src1, Dest, length, C);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process unaligned bytes */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 - iC;
+		if (result < 0)
+			result = 0;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
+
+Only whole groups of 8 bytes (SrcLength/8 of them) are processed; any
+trailing SrcLength % 8 bytes are not touched by this routine. When USE_MMX
+is not defined the function does nothing and returns -1.
+
+\note The inline-assembly loop decrements its counter before testing it, so
+that path requires SrcLength >= 8.
+\note NOTE(review): the inline-assembly path packs C into the low dword and D
+into the high dword of the subtrahend, whereas the intrinsics path packs C
+into both dwords and never reads D. The two paths therefore differ whenever
+D != C - confirm which one is intended.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param C Constant to subtract (C).
+\param D Byteorder-swapped constant to subtract (Cs).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterSubUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned int C, unsigned int D)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate (int)C in 8 bytes of MM1 ** */
+			mov eax, C   	/* load C into EAX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			mov eax, D   	/* load D into EAX */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher bytes of MM1 with D (held in MM2) */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L11024:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psubusb mm0, mm1 	/* MM0=Src1-C (sub 8 bytes with saturation) */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L11024    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate (int)C in 8 bytes of MM1 */
+	__m64 mm1 = _m_from_int(C);
+	__m64 mm2 = _m_from_int(C);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher bytes of MM1 with C */
+        //__m64 mm1 = _m_from_int64(lli); // x86_64 only
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_psubusb(*mSrc1, mm1);	/* Src1-C (sub 8 bytes with saturation) */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using SubUint: D = saturation0(S[i] - Cs[i % 4]), Cs=Swap32((uint)C)
+
+In the C routine the byte at offset i has byte (i % 4) of C subtracted from
+it, counting from the least significant byte of C; results below zero are
+clamped to 0. Src1 and Dest may be the same buffer.
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param C Constant to subtract (C).
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterSubUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned int C)
+{
+	unsigned int i, j, istart, D;
+	int iC[4];
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==0 - the result is a plain copy of the source.
+	   memmove (destination first!) also copes with Src1 == Dest. */
+	if (C == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine handles all complete groups of 8 bytes */
+		D=SWAP_32(C);
+		SDL_imageFilterSubUintMMX(Src1, Dest, length, C, D);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the (length % 8) trailing bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image */
+	/* Split C into its four bytes, least significant byte first */
+	iC[3] = (int) ((C >> 24) & 0xff);
+	iC[2] = (int) ((C >> 16) & 0xff);
+	iC[1] = (int) ((C >>  8) & 0xff);
+	iC[0] = (int) ((C >>  0) & 0xff);
+	/* istart is always a multiple of 4, so j selects the byte of C that
+	   belongs to the position within the current 4-byte group */
+	for (i = istart; i < length; i += 4) {
+		for (j = 0; (j < 4) && (j < (length - i)); j++) {
+			result = (int) *cursrc1 - iC[j];
+			/* Saturate at the lower bound */
+			if (result < 0) result = 0;
+			*curdest = (unsigned char) result;
+			/* Advance pointers */
+			cursrc1++;
+			curdest++;
+		}
+	}
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftRight: D = saturation0(S >> N)
+
+The routine shifts 16-bit words and then masks each byte, so that bits
+shifted in from the neighbouring byte are cleared. The per-byte mask is built
+by shifting an all-ones register right N times, applying Mask after every
+step. Only whole groups of 8 bytes (SrcLength/8 of them) are processed. When
+USE_MMX is not defined the function does nothing and returns -1.
+
+\note The inline-assembly loops decrement their counters before testing them,
+so that path requires N >= 1 and SrcLength >= 8.
+\note In the intrinsics path mm1 is passed to _m_pcmpeqb before it has been
+assigned; comparing a register with itself yields all ones whatever it holds.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param Mask Byte array containing 8 bytes with 0x7F value.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftRightMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
+								 unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov edx, Mask   	/* load Mask address into edx */
+			movq mm0, [edx]   	/* load Mask into mm0 */
+		xor ecx, ecx   	/* zero ECX */
+			mov cl,  N 	/* load loop counter (N) into CL */
+			movd mm3,  ecx 	/* copy (N) into MM3  */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+L10240:                  	/* ** Prepare proper bit-Mask in MM1 ** */
+		psrlw mm1,  1 	/* shift 4 WORDS of MM1 1 bit to the right */
+			pand mm1, mm0   /* apply Mask to 8 BYTES of MM1 */
+			/*  byte     0x0f, 0xdb, 0xc8 */
+			dec               cl    	/* decrease loop counter */
+			jnz            L10240    	/* check loop termination, proceed if required */
+			/* ** Shift all bytes of the image ** */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx,  SrcLength 	/* load loop counter (SIZE) into ecx */
+			shr ecx,  3 	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10241:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psrlw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the right */
+			pand mm0, mm1    /* apply proper bit-Mask to 8 BYTES of MM0 */
+			/* byte     0x0f, 0xdb, 0xc1 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10241    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+        __m64 mm1;
+	int i;
+	mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 */
+	/* Prepare proper bit-Mask in MM1 */
+	for (i = 0; i < N; i++) {
+		mm1 = _m_psrlwi(mm1, 1);		/* shift 4 WORDS of MM1 1 bit to the right */
+		mm1 = _m_pand(mm1, *mMask);		/* apply Mask to 8 BYTES of MM1 */
+	}
+        /* Shift all bytes of the image */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0 = _m_psrlwi(*mSrc1, N);	/* shift 4 WORDS of MM0 (N) bits to the right */
+		*mDest = _m_pand(mm0, mm1);		/* apply proper bit-Mask to 8 BYTES of MM0 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftRight: D = saturation0(S >> N)
+
+Every source byte is shifted right by N bits independently of its
+neighbours. Src1 and Dest may be the same buffer.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\return Returns 0 for success or -1 for error (NULL pointer argument or N > 8).
+*/
+int SDL_imageFilterShiftRight(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	/* Per-byte mask used by the MMX routine to drop bits that a word shift
+	   drags in from the neighbouring byte */
+	static unsigned char Mask[8] = { 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F };
+	unsigned int i, istart;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Check shift */
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 - the result is a plain copy of the source.
+	   memmove (destination first!) also copes with Src1 == Dest. */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine handles all complete groups of 8 bytes */
+		SDL_imageFilterShiftRightMMX(Src1, Dest, length, N, Mask);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the (length % 8) trailing bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		/* The byte is promoted to int before shifting, so N == 8 yields 0 */
+		*curdest = (unsigned char) (*cursrc1 >> N);
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
+
+Each 8-byte group is treated as two 32-bit values which are both shifted
+right by N. Only whole groups of 8 bytes (SrcLength/8 of them) are processed.
+When USE_MMX is not defined the function does nothing and returns -1.
+
+\note The inline-assembly loop decrements its counter before testing it, so
+that path requires SrcLength >= 8.
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftRightUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L13023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		/* shift 2 DWORDS of MM0 (N) bits to the right */
+		psrld mm0, N
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L13023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		/* shift 2 DWORDS of Src1 (N) bits to the right */
+		*mDest = _m_psrldi(*mSrc1, N);
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftRightUint: D = saturation0((uint)S[i] >> N)
+
+The buffers are processed as a sequence of unsigned int values in native
+byte order, each of which is shifted right by N bits. Trailing bytes that do
+not fill a whole unsigned int are left untouched in Dest. Src1 and Dest may
+be the same buffer and need not be aligned.
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 32.
+
+\return Returns 0 for success or -1 for error (NULL pointer argument or N > 32).
+*/
+int SDL_imageFilterShiftRightUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdest;
+	unsigned int word;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (N > 32) {
+		return (-1);
+	}
+
+	/* Special case: N==0 - the result is a plain copy of the source.
+	   memmove (destination first!) also copes with Src1 == Dest. */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine handles all complete groups of 8 bytes */
+		SDL_imageFilterShiftRightUintMMX(Src1, Dest, length, N);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the (length % 8) trailing bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: one unsigned int per iteration, for as
+	   long as a complete one remains (istart never exceeds length) */
+	for (i = istart; (length - i) >= sizeof(word); i += (unsigned int) sizeof(word)) {
+		/* memcpy avoids misaligned and type-punned accesses */
+		memcpy(&word, cursrc1, sizeof(word));
+		/* Shifting by the full width is undefined in C; the result is 0 */
+		word = (N < 32) ? (word >> N) : 0;
+		memcpy(curdest, &word, sizeof(word));
+		/* Advance pointers */
+		cursrc1 += sizeof(word);
+		curdest += sizeof(word);
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using MultByByte: D = saturation255(S * C)
+
+Source bytes are unpacked to 16-bit words, multiplied by C (low 16 bits of
+the product are kept) and packed back to bytes with unsigned saturation. Two
+loops exist: a short one intended for C <= 128, and one that first takes the
+absolute value of each signed word product before packing. Only whole groups
+of 8 bytes (SrcLength/8 of them) are processed. When USE_MMX is not defined
+the function does nothing and returns -1.
+
+\note The inline-assembly loops decrement their counter before testing it, so
+that path requires SrcLength >= 8.
+\note NOTE(review): in the inline-assembly path `cmp al, 128` executes after
+EAX has been reloaded with the Src1 address, so it appears to test the low
+byte of the pointer rather than C, and `jg` is a signed comparison. The
+intrinsics path tests `C <= 128` directly - confirm before relying on the
+assembly path.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param C Constant to multiply with (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char C)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 4 words of MM1 ** */
+			mov al, C   	/* load C into AL */
+			xor ah, ah   	/* zero AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
+			pxor mm0, mm0   	/* zero MM0 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			cmp al, 128   	/* if (C <= 128) execute more efficient code */
+			jg             L10251
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10250:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of SrcDest into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of SrcDest into words */
+			pmullw mm3, mm1   	/* mul low  bytes of SrcDest and MM1 */
+			pmullw mm4, mm1   	/* mul high bytes of SrcDest and MM1 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10250    	/* check loop termination, proceed if required */
+			jmp            L10252
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10251:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of SrcDest into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of SrcDest into words */
+			pmullw mm3, mm1   	/* mul low  bytes of SrcDest and MM1 */
+			pmullw mm4, mm1   	/* mul high bytes of SrcDest and MM1 */
+			/* ** Take abs value of the results (signed words) ** */
+			movq mm5, mm3   	/* copy mm3 into mm5 */
+			movq mm6, mm4   	/* copy mm4 into mm6 */
+			psraw mm5, 15   	/* fill mm5 words with word sign bit */
+			psraw mm6, 15   	/* fill mm6 words with word sign bit */
+			pxor mm3, mm5   	/* take 1's compliment of only neg words */
+			pxor mm4, mm6   	/* take 1's compliment of only neg words */
+			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
+			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10251    	/* check loop termination, proceed if required */
+L10252:
+		emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0);				/* zero mm0 register */
+	/* Duplicate C in 4 words of MM1 */
+	int i;
+	i = C | C<<16;
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);				/* fill higher words of MM1 with C */
+	// long long lli = C | C<<16 | (long long)C<<32 | (long long)C<<48;
+        //__m64 mm1 = _m_from_int64(lli); // x86_64 only
+	if (C <= 128) {						/* if (C <= 128) execute more efficient code */
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
+			mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	} else {
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4, mm5, mm6;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
+			mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
+			/* Take abs value of the results (signed words) */
+			mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
+			mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
+			mm3 = _m_pxor(mm3, mm5);		/* take 1's compliment of only neg. words */
+			mm4 = _m_pxor(mm4, mm6);		/* take 1's compliment of only neg. words */
+			mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	}
+	_m_empty();						/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using MultByByte: D = saturation255(S * C)
+
+Every source byte is multiplied by the constant C; products above 255 are
+clamped to 255. Src1 and Dest may be the same buffer.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+\param C Constant to multiply with (C).
+
+\return Returns 0 for success or -1 for error (NULL pointer argument).
+*/
+int SDL_imageFilterMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: C==1 - the result is a plain copy of the source.
+	   memmove (destination first!) also copes with Src1 == Dest. */
+	if (C == 1) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine handles all complete groups of 8 bytes */
+		SDL_imageFilterMultByByteMMX(Src1, Dest, length, C);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the (length % 8) trailing bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		result = (int) *cursrc1 * iC;
+		/* Saturate at the upper bound */
+		if (result > 255)
+			result = 255;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftRightAndMultByByteMMX: D = saturation255((S >> N) * C) 
+
+Source bytes are unpacked to 16-bit words, shifted right by N, multiplied by
+C and packed back to bytes with unsigned saturation. Only whole groups of 8
+bytes (SrcLength/8 of them) are processed. When USE_MMX is not defined the
+function does nothing and returns -1.
+
+\note The inline-assembly loop decrements its counter before testing it, so
+that path requires SrcLength >= 8.
+\note The locals mm5 and mm6 declared in the intrinsics loop are never used.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param C Constant to multiply with (C).
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftRightAndMultByByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
+											  unsigned char C)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate C in 4 words of MM1 ** */
+			mov al, C   	/* load C into AL */
+			xor ah, ah   	/* zero AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher words of MM1 with C */
+			xor ecx, ecx   	/* zero ECX */
+			mov cl, N   	/* load N into CL */
+			movd mm7, ecx   	/* copy N into MM7 */
+			pxor mm0, mm0   	/* zero MM0 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1026:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of SrcDest into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of SrcDest into words */
+			psrlw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the right */
+			psrlw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the right */
+			pmullw mm3, mm1   	/* mul low  bytes of SrcDest by MM1 */
+			pmullw mm4, mm1   	/* mul high bytes of SrcDest by MM1 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1026    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0);			/* zero mm0 register */
+	/* Duplicate C in 4 words of MM1 */
+	int i;
+	i = (C<<16)|C;
+	__m64 mm1 = _m_from_int(i);
+	__m64 mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher words of MM1 with C */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm3, mm4, mm5, mm6;
+		mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+		mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_psrlwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the right */
+		mm4 = _m_psrlwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the right */
+		mm3 = _m_pmullw(mm3, mm1);		/* mul low  bytes of Src1 and MM1 */
+		mm4 = _m_pmullw(mm4, mm1);		/* mul high bytes of Src1 and MM1 */
+		*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftRightAndMultByByte: D = saturation255((S >> N) * C) 
+
+Every source byte is shifted right by N bits and then multiplied by C;
+products above 255 are clamped to 255. Src1 and Dest may be the same buffer.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param C Constant to multiply with (C).
+
+\return Returns 0 for success or -1 for error (NULL pointer argument or N > 8).
+*/
+int SDL_imageFilterShiftRightAndMultByByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N,
+										   unsigned char C)
+{
+	unsigned int i, istart;
+	int iC;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Check shift */
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 && C==1 - the result is a plain copy of the source.
+	   memmove (destination first!) also copes with Src1 == Dest. */
+	if ((N == 0) && (C == 1)) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine handles all complete groups of 8 bytes */
+		SDL_imageFilterShiftRightAndMultByByteMMX(Src1, Dest, length, N, C);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the (length % 8) trailing bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image */
+	iC = (int) C;
+	for (i = istart; i < length; i++) {
+		/* The byte is promoted to int before shifting, so N == 8 yields 0 */
+		result = (int) (*cursrc1 >> N) * iC;
+		/* Saturate at the upper bound */
+		if (result > 255)
+			result = 255;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftLeftByte: D = (S << N)
+
+The routine shifts 16-bit words and then masks each byte, so that bits
+shifted in from the neighbouring byte are cleared. The per-byte mask is built
+by shifting an all-ones register left N times, applying Mask after every
+step. Only whole groups of 8 bytes (SrcLength/8 of them) are processed. When
+USE_MMX is not defined the function does nothing and returns -1.
+
+\note The inline-assembly loops decrement their counters before testing them,
+so that path requires N >= 1 and SrcLength >= 8.
+\note In the intrinsics path mm1 is passed to _m_pcmpeqb before it has been
+assigned; comparing a register with itself yields all ones whatever it holds.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source arrays.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+\param Mask Byte array containing 8 bytes of 0xFE value.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftLeftByteMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N,
+									unsigned char *Mask)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov edx, Mask   	/* load Mask address into edx */
+			movq mm0, [edx]   	/* load Mask into mm0 */
+		xor ecx, ecx   	/* zero ECX */
+			mov cl, N   	/* load loop counter (N) into CL */
+			movd mm3, ecx   	/* copy (N) into MM3  */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+L10270:                  	/* ** Prepare proper bit-Mask in MM1 ** */
+		psllw mm1, 1   	/* shift 4 WORDS of MM1 1 bit to the left */
+			pand mm1, mm0        /* apply Mask to 8 BYTES of MM1 */
+			/*  byte     0x0f, 0xdb, 0xc8 */
+			dec cl                  	/* decrease loop counter */
+			jnz            L10270    	/* check loop termination, proceed if required */
+			/* ** Shift all bytes of the image ** */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10271:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		psllw mm0, mm3   	/* shift 4 WORDS of MM0 (N) bits to the left */
+			pand mm0, mm1    /* apply proper bit-Mask to 8 BYTES of MM0 */
+			/* byte     0x0f, 0xdb, 0xc1 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10271    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 *mMask = (__m64*)Mask;
+        __m64 mm1;
+	int i;
+	mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 */
+	/* Prepare proper bit-Mask in MM1 */
+	for (i = 0; i < N; i++) {
+		mm1 = _m_psllwi(mm1, 1);		/* shift 4 WORDS of MM1 1 bit to the left */
+		mm1 = _m_pand(mm1, *mMask);		/* apply Mask to 8 BYTES of MM1 */
+	}
+	/* ** Shift all bytes of the image ** */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0 = _m_psllwi(*mSrc1, N);	/* shift 4 WORDS of MM0 (N) bits to the left */
+		*mDest = _m_pand(mm0, mm1);		/* apply proper bit-Mask to 8 BYTES of MM0 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftLeftByte: D = (S << N)
+
+Every source byte is shifted left by N bits independently of its neighbours;
+bits shifted out of the byte are discarded (no saturation). Src1 and Dest may
+be the same buffer.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source arrays.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\return Returns 0 for success or -1 for error (NULL pointer argument or N > 8).
+*/
+int SDL_imageFilterShiftLeftByte(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	/* Per-byte mask used by the MMX routine to drop bits that a word shift
+	   drags in from the neighbouring byte */
+	static unsigned char Mask[8] = { 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE, 0xFE };
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 - the result is a plain copy of the source.
+	   memmove (destination first!) also copes with Src1 == Dest. */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine handles all complete groups of 8 bytes */
+		SDL_imageFilterShiftLeftByteMMX(Src1, Dest, length, N, Mask);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the (length % 8) trailing bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		/* Keep only the low 8 bits of the shifted value */
+		result = ((int) *cursrc1 << N) & 0xff;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ShiftLeftUint: D = ((uint)S << N)
+
+Each 8-byte group is treated as two 32-bit values which are both shifted left
+by N. Only whole groups of 8 bytes (SrcLength/8 of them) are processed. When
+USE_MMX is not defined the function does nothing and returns -1.
+
+\note The inline-assembly loop decrements its counter before testing it, so
+that path requires SrcLength >= 8.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 32.
+
+\return Returns 0 for success or -1 for error.
+*/
+static int SDL_imageFilterShiftLeftUintMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L12023:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		pslld mm0, N   	/* shift 2 DWORDS of MM0 (N) bits to the left */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L12023    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64 */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	int i;
+	for (i = 0; i < SrcLength/8; i++) {
+		*mDest = _m_pslldi(*mSrc1, N);	/* shift 2 DWORDS of Src1 (N) bits to the left */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+#endif
+	return (0);
+#else
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ShiftLeftUint: D = ((uint)S << N)
+
+The buffers are processed as a sequence of unsigned int values in native
+byte order, each of which is shifted left by N bits. Trailing bytes that do
+not fill a whole unsigned int are left untouched in Dest. Src1 and Dest may
+be the same buffer and need not be aligned.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 32.
+
+\return Returns 0 for success or -1 for error (NULL pointer argument or N > 32).
+*/
+int SDL_imageFilterShiftLeftUint(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdest;
+	unsigned int word;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (N > 32) {
+		return (-1);
+	}
+
+	/* Special case: N==0 - the result is a plain copy of the source.
+	   memmove (destination first!) also copes with Src1 == Dest. */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		/* MMX routine handles all complete groups of 8 bytes */
+		SDL_imageFilterShiftLeftUintMMX(Src1, Dest, length, N);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the (length % 8) trailing bytes in C */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image: one unsigned int per iteration, for as
+	   long as a complete one remains (istart never exceeds length) */
+	for (i = istart; (length - i) >= sizeof(word); i += (unsigned int) sizeof(word)) {
+		/* memcpy avoids misaligned and type-punned accesses */
+		memcpy(&word, cursrc1, sizeof(word));
+		/* Shifting by the full width is undefined in C; the result is 0 */
+		word = (N < 32) ? (word << N) : 0;
+		memcpy(curdest, &word, sizeof(word));
+		/* Advance pointers */
+		cursrc1 += sizeof(word);
+		curdest += sizeof(word);
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter ShiftLeft: D = saturation255(S << N)
+
+Works on groups of 8 bytes (SrcLength/8 iterations). Each byte is widened to a
+16-bit word, shifted left by N bits and packed back to a byte with unsigned
+saturation. Two loop variants exist: a short one for N <= 7, and one that
+additionally takes the absolute value of the shifted words before packing
+(presumably because a shift by 8 can set the word's sign bit, which the
+signed-input pack instruction would otherwise clamp to 0).
+
+In the inline-assembly variant the loop counter is decremented before it is
+tested, so SrcLength is expected to be at least 8.
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+
+\return Returns 0 when built with USE_MMX, or -1 when MMX support is compiled out.
+*/
+static int SDL_imageFilterShiftLeftMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char N)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			xor eax, eax   	/* zero EAX */
+			mov al, N   	/* load N into AL */
+			movd mm7, eax   	/* copy N into MM7 (shift count) */
+			pxor mm0, mm0   	/* zero MM0 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			/* NOTE(review): EAX was reloaded with the Src1 address above, so AL no longer holds N at this compare - confirm the intended operand */
+			cmp al, 7   	/* if (N <= 7) execute more efficient code */
+			jg             L10281
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10280:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
+			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
+			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10280    	/* check loop termination, proceed if required */
+			jmp            L10282
+			align 16                 	/* 16 byte alignment of the loop entry */
+L10281:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm0   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm0   	/* unpack high bytes of Src1 into words */
+			psllw mm3, mm7   	/* shift 4 WORDS of MM3 (N) bits to the left */
+			psllw mm4, mm7   	/* shift 4 WORDS of MM4 (N) bits to the left */
+			/* ** Take abs value of the signed words ** */
+			movq mm5, mm3   	/* copy mm3 into mm5 */
+			movq mm6, mm4   	/* copy mm4 into mm6 */
+			psraw mm5, 15   	/* fill mm5 words with word sign bit */
+			psraw mm6, 15   	/* fill mm6 words with word sign bit */
+			pxor mm3, mm5   	/* take 1's complement of only neg words */
+			pxor mm4, mm6   	/* take 1's complement of only neg words */
+			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
+			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz            L10281    	/* check loop termination, proceed if required */
+L10282:
+		emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: same operation expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0 = _m_from_int(0);				/* zero mm0 register */
+	int i;
+	if (N <= 7) {						/* if (N <= 7) execute more efficient code */
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
+			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	} else {
+		for (i = 0; i < SrcLength/8; i++) {
+			__m64 mm3, mm4, mm5, mm6;
+			mm3 = _m_punpcklbw(*mSrc1, mm0);	/* unpack low  bytes of Src1 into words */
+			mm4 = _m_punpckhbw(*mSrc1, mm0);	/* unpack high bytes of Src1 into words */
+			mm3 = _m_psllwi(mm3, N);		/* shift 4 WORDS of MM3 (N) bits to the left */
+			mm4 = _m_psllwi(mm4, N);		/* shift 4 WORDS of MM4 (N) bits to the left */
+			/* Take abs value of the signed words */
+			mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
+			mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
+			mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
+			mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
+			mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+			*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+			mSrc1++;
+			mDest++;
+		}
+	}
+	_m_empty();						/* clean MMX state */
+#endif
+	return (0);
+#else
+	/* MMX support not compiled in */
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter ShiftLeft: D = saturation255(S << N)
+
+Each source byte is shifted left by N bits; results above 255 are clamped to
+255 before being stored in the destination.
+
+\param Src1 Pointer to the start of the source byte array (S1).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param N Number of bit-positions to shift (N). Valid range is 0 to 8.
+       N==0 copies the source to the destination unchanged.
+
+\return Returns 0 for success or -1 for error (NULL pointer or N > 8).
+*/
+int SDL_imageFilterShiftLeft(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char N)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1, *curdest;
+	int result;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	if (N > 8) {
+		return (-1);
+	}
+
+	/* Special case: N==0 is a plain copy from source to destination.
+	   memmove is used so that in-place calls (Src1 == Dest) stay defined. */
+	if (N == 0) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		SDL_imageFilterShiftLeftMMX(Src1, Dest, length, N);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the bytes after the last full group of 8 */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		/* Widen to int so the shifted value can exceed 255 before clamping */
+		result = (int) *cursrc1 << N;
+		if (result > 255)
+			result = 255;
+		*curdest = (unsigned char) result;
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief MMX BinarizeUsingThreshold: D = (S >= T) ? 255:0
+
+Works on groups of 8 bytes (SrcLength/8 iterations). The constant (0xFF - T) is
+added to every source byte with unsigned saturation, so exactly the bytes with
+S >= T reach 0xFF; comparing the sum for equality with 0xFF then produces 255
+for those bytes and 0 for all others.
+
+In the inline-assembly variant the loop counter is decremented before it is
+tested, so SrcLength is expected to be at least 8.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param T The threshold boundary (inclusive).
+
+\return Returns 0 when built with USE_MMX, or -1 when MMX support is compiled out.
+*/
+static int SDL_imageFilterBinarizeUsingThresholdMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char T)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			/* ** Duplicate T in 8 bytes of MM3 ** */
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+			pcmpeqb mm2, mm2   	/* generate all 1's in mm2 */
+			mov al, T   	/* load T into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm3, eax   	/* copy EAX into MM3 */
+			movd mm4, eax   	/* copy EAX into MM4 */
+			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with T */
+			psubusb mm2, mm3   	/* store 0xFF - T in MM2 */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1029:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0, mm2   	/* MM0=Src1+(0xFF-T) (add 8 bytes with saturation) */
+			pcmpeqb mm0, mm1   	/* binarize 255:0, comparing to 255 */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1029    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: same operation expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* Duplicate T in 8 bytes of MM3 */
+	/* NOTE(review): mm1 and mm2 are compared with themselves before being initialised; the result is all 1's for any bit pattern, but compilers may warn */
+	__m64 mm1 = _m_pcmpeqb(mm1, mm1);			/* generate all 1's in mm1 */
+	__m64 mm2 = _m_pcmpeqb(mm2, mm2);			/* generate all 1's in mm2 */
+	int i;
+	memset(&i, T, 4);				/* set 4 bytes of i to T */
+	__m64 mm3 = _m_from_int(i);
+	__m64 mm4 = _m_from_int(i);
+	mm3 = _m_punpckldq(mm3, mm4);			/* fill higher bytes of MM3 with T */
+	mm2 = _m_psubusb(mm2, mm3);			/* store 0xFF - T in MM2 */
+        //__m64 mm3 = _m_from_int64(lli); // x86_64 only
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0 = _m_paddusb(*mSrc1, mm2);	/* Src1+(0xFF-T) (add 8 bytes with saturation) */
+		*mDest = _m_pcmpeqb(mm0, mm1);		/* binarize 255:0, comparing to 255 */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	/* MMX support not compiled in */
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using BinarizeUsingThreshold: D = (S >= T) ? 255:0
+
+Every destination byte becomes 255 when the matching source byte is at least T
+and 0 otherwise.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param T The threshold boundary (inclusive).
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterBinarizeUsingThreshold(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char T)
+{
+	unsigned int pos = 0;
+
+	/* Reject missing buffers; an empty array needs no work */
+	if (!Src1 || !Dest) {
+		return (-1);
+	}
+	if (!length) {
+		return (0);
+	}
+
+	/* With T==0 every byte satisfies S >= T, so the whole output is 255 */
+	if (!T) {
+		memset(Dest, 255, length);
+		return (0);
+	}
+
+	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
+		/* Let the MMX helper process the complete groups of 8 bytes */
+		SDL_imageFilterBinarizeUsingThresholdMMX(Src1, Dest, length, T);
+		if (!(length & 7)) {
+			/* Length is a multiple of 8 - nothing left to do */
+			return (0);
+		}
+		/* Continue with the bytes after the last complete group */
+		pos = length & 0xfffffff8;
+	}
+
+	/* Plain C loop for everything that has not been handled yet */
+	while (pos < length) {
+		Dest[pos] = (unsigned char) ((Src1[pos] >= T) ? 255 : 0);
+		pos++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using ClipToRange: D = (S < Tmin) ? Tmin : ((S > Tmax) ? Tmax : S)
+
+Works on groups of 8 bytes (SrcLength/8 iterations) using three saturating
+byte operations per group: add (0xFF - Tmax), subtract (0xFF - Tmax + Tmin),
+add Tmin.
+
+In the inline-assembly variant the loop counter is decremented before it is
+tested, so SrcLength is expected to be at least 8.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param Tmin Lower (inclusive) boundary of the clipping range.
+\param Tmax Upper (inclusive) boundary of the clipping range.
+
+\return Returns 0 when built with USE_MMX, or -1 when MMX support is compiled out.
+*/
+static int SDL_imageFilterClipToRangeMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, unsigned char Tmin,
+								  unsigned char Tmax)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			pcmpeqb mm1, mm1   	/* generate all 1's in mm1 */
+			/* ** Duplicate Tmax in 8 bytes of MM3 ** */
+			mov al, Tmax   	/* load Tmax into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm3, eax   	/* copy EAX into MM3 */
+			movd mm4, eax   	/* copy EAX into MM4 */
+			punpckldq mm3, mm4   	/* fill higher bytes of MM3 with Tmax */
+			psubusb mm1, mm3   	/* store 0xFF - Tmax in MM1 */
+			/* ** Duplicate Tmin in 8 bytes of MM5 ** */
+			mov al, Tmin   	/* load Tmin into AL */
+			mov ah, al   	/* copy AL into AH */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm5, eax   	/* copy EAX into MM5 */
+			movd mm4, eax   	/* copy EAX into MM4 */
+			punpckldq mm5, mm4   	/* fill higher bytes of MM5 with Tmin */
+			movq mm7, mm5   	/* copy MM5 into MM7 */
+			paddusb mm7, mm1   	/* store 0xFF - Tmax + Tmin in MM7 */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1030:
+		movq mm0, [eax]   	/* load 8 bytes from Src1 into MM0 */
+		paddusb mm0, mm1   	/* MM0=Src1+(0xFF-Tmax), saturating at 0xFF */
+			psubusb mm0, mm7   	/* MM0=MM0-(0xFF-Tmax+Tmin), saturating at 0 */
+			paddusb mm0, mm5   	/* MM0=MM0+Tmin */
+			movq [edi], mm0   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1030    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: same operation expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	/* NOTE(review): mm1 is compared with itself before being initialised; the result is all 1's for any bit pattern, but compilers may warn */
+	__m64 mm1 = _m_pcmpeqb(mm1, mm1);	/* generate all 1's in mm1 */
+	int i;
+	/* Duplicate Tmax in 8 bytes of MM3 */
+	__m64 mm3, mm4;
+	memset(&i, Tmax, 4);			/* set 4 bytes of i to Tmax */
+	mm3 = _m_from_int(i);
+	mm4 = _m_from_int(i);
+	mm3 = _m_punpckldq(mm3, mm4);		/* fill higher bytes of MM3 with Tmax */
+	mm1 = _m_psubusb(mm1, mm3);		/* store 0xFF - Tmax in MM1 */
+        //__m64 mm3 = _m_from_int64(lli); // x86_64 only
+	/* Duplicate Tmin in 8 bytes of MM5 */
+	__m64 mm5, mm7;
+	memset(&i, Tmin, 4);			/* set 4 bytes of i to Tmin */
+	mm5 = _m_from_int(i);
+	mm4 = _m_from_int(i);
+	mm5 = _m_punpckldq(mm5, mm4);		/* fill higher bytes of MM5 with Tmin */
+	mm7 = _m_paddusb(mm5, mm1);	/* store 0xFF - Tmax + Tmin in MM7 */
+	for (i = 0; i < SrcLength/8; i++) {
+		__m64 mm0;
+		mm0 = _m_paddusb(*mSrc1, mm1);	/* MM0=Src1+(0xFF-Tmax), saturating at 0xFF */
+		mm0 = _m_psubusb(mm0, mm7);	/* MM0=MM0-(0xFF-Tmax+Tmin), saturating at 0 */
+		*mDest = _m_paddusb(mm0, mm5);	/* MM0+Tmin */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();				/* clean MMX state */
+#endif
+	return (0);
+#else
+	/* MMX support not compiled in */
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using ClipToRange: D = (S < Tmin) ? Tmin : ((S > Tmax) ? Tmax : S)
+
+Source bytes below Tmin are replaced by Tmin, bytes above Tmax are replaced by
+Tmax, and all other bytes are copied unchanged.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param Tmin Lower (inclusive) boundary of the clipping range.
+\param Tmax Upper (inclusive) boundary of the clipping range.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterClipToRange(unsigned char *Src1, unsigned char *Dest, unsigned int length, unsigned char Tmin,
+							   unsigned char Tmax)
+{
+	unsigned int i, istart;
+	unsigned char *cursrc1;
+	unsigned char *curdest;
+
+	/* Validate input parameters */
+	if ((Src1 == NULL) || (Dest == NULL))
+		return(-1);
+	if (length == 0)
+		return(0);
+
+	/* Special case: Tmin==0 && Tmax==255 covers the full byte range, so
+	   nothing is clipped and the source is copied to the destination.
+	   memmove is used so that in-place calls (Src1 == Dest) stay defined. */
+	if ((Tmin == 0) && (Tmax == 255)) {
+		memmove(Dest, Src1, length);
+		return (0);
+	}
+
+	if ((SDL_imageFilterMMXdetect()) && (length > 7)) {
+
+		SDL_imageFilterClipToRangeMMX(Src1, Dest, length, Tmin, Tmax);
+
+		/* Check for unaligned bytes */
+		if ((length & 7) > 0) {
+			/* Setup to process the bytes after the last full group of 8 */
+			istart = length & 0xfffffff8;
+			cursrc1 = &Src1[istart];
+			curdest = &Dest[istart];
+		} else {
+			/* No unaligned bytes - we are done */
+			return (0);
+		}
+	} else {
+		/* Setup to process whole image */
+		istart = 0;
+		cursrc1 = Src1;
+		curdest = Dest;
+	}
+
+	/* C routine to process image */
+	for (i = istart; i < length; i++) {
+		if (*cursrc1 < Tmin) {
+			*curdest = Tmin;
+		} else if (*cursrc1 > Tmax) {
+			*curdest = Tmax;
+		} else {
+			*curdest = *cursrc1;
+		}
+		/* Advance pointers */
+		cursrc1++;
+		curdest++;
+	}
+
+	return (0);
+}
+
+/*!
+\brief Internal MMX Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
+
+Works on groups of 8 bytes (SrcLength/8 iterations). The factor
+(Nmax - Nmin)/(Cmax - Cmin) is computed once with 16-bit integer division; when
+Cmax - Cmin is zero the factor 255 is used instead. Source bytes are widened to
+16-bit words for the arithmetic and packed back to bytes with unsigned
+saturation.
+
+In the inline-assembly variant the loop counter is decremented before it is
+tested, so SrcLength is expected to be at least 8.
+
+\param Src1 Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param SrcLength The number of bytes in the source array.
+\param Cmin Normalization constant (Cmin).
+\param Cmax Normalization constant (Cmax).
+\param Nmin Normalization constant (Nmin).
+\param Nmax Normalization constant (Nmax).
+
+\return Returns 0 when built with USE_MMX, or -1 when MMX support is compiled out.
+*/
+static int SDL_imageFilterNormalizeLinearMMX(unsigned char *Src1, unsigned char *Dest, unsigned int SrcLength, int Cmin, int Cmax,
+									  int Nmin, int Nmax)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{
+		pusha
+			mov ax, WORD PTR Nmax   	/* load Nmax in AX */
+			mov bx, WORD PTR Cmax   	/* load Cmax in BX */
+			sub ax, WORD PTR Nmin   	/* AX = Nmax - Nmin */
+			sub bx, WORD PTR Cmin   	/* BX = Cmax - Cmin */
+			jz             L10311    	/* check division by zero */
+			xor dx, dx   	/* prepare for division, zero DX */
+			div               bx    	/* AX = AX/BX (unsigned 16-bit division) */
+			jmp            L10312
+L10311:
+		mov ax, 255   	/* if div by zero, assume result max byte value */
+L10312:                  	/* ** Duplicate AX in 4 words of MM0 ** */
+		mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm0, eax   	/* copy EAX into MM0 */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			punpckldq mm0, mm1   	/* fill higher words of MM0 with AX */
+			/* ** Duplicate Cmin in 4 words of MM1 ** */
+			mov ax, WORD PTR Cmin   	/* load Cmin into AX */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm1, eax   	/* copy EAX into MM1 */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			punpckldq mm1, mm2   	/* fill higher words of MM1 with Cmin */
+			/* ** Duplicate Nmin in 4 words of MM2 ** */
+			mov ax, WORD PTR Nmin   	/* load Nmin into AX */
+			mov bx, ax   	/* copy AX into BX */
+			shl eax, 16   	/* shift 2 bytes of EAX left */
+			mov ax, bx   	/* copy BX into AX */
+			movd mm2, eax   	/* copy EAX into MM2 */
+			movd mm3, eax   	/* copy EAX into MM3 */
+			punpckldq mm2, mm3   	/* fill higher words of MM2 with Nmin */
+			pxor mm7, mm7   	/* zero MM7 register */
+			mov eax, Src1   	/* load Src1 address into eax */
+			mov edi, Dest   	/* load Dest address into edi */
+			mov ecx, SrcLength   	/* load loop counter (SIZE) into ecx */
+			shr ecx, 3   	/* counter/8 (MMX loads 8 bytes at a time) */
+			align 16                 	/* 16 byte alignment of the loop entry */
+L1031:
+		movq mm3, [eax]   	/* load 8 bytes from Src1 into MM3 */
+		movq mm4, mm3   	/* copy MM3 into MM4  */
+			punpcklbw mm3, mm7   	/* unpack low  bytes of Src1 into words */
+			punpckhbw mm4, mm7   	/* unpack high bytes of Src1 into words */
+			/* NOTE(review): psubusb/paddusb below are byte-granular saturating ops applied to word-unpacked data - confirm this is intended */
+			psubusb mm3, mm1   	/* S-Cmin, low  bytes */
+			psubusb mm4, mm1   	/* S-Cmin, high bytes */
+			pmullw mm3, mm0   	/* MM0*(S-Cmin), low  bytes */
+			pmullw mm4, mm0   	/* MM0*(S-Cmin), high bytes */
+			paddusb mm3, mm2   	/* MM0*(S-Cmin)+Nmin, low  bytes */
+			paddusb mm4, mm2   	/* MM0*(S-Cmin)+Nmin, high bytes */
+			/* ** Take abs value of the signed words ** */
+			movq mm5, mm3   	/* copy mm3 into mm5 */
+			movq mm6, mm4   	/* copy mm4 into mm6 */
+			psraw mm5, 15   	/* fill mm5 words with word sign bit */
+			psraw mm6, 15   	/* fill mm6 words with word sign bit */
+			pxor mm3, mm5   	/* take 1's complement of only neg words */
+			pxor mm4, mm6   	/* take 1's complement of only neg words */
+			psubsw mm3, mm5   	/* add 1 to only neg words, W-(-1) or W-0 */
+			psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+			packuswb mm3, mm4   	/* pack words back into bytes with saturation */
+			movq [edi], mm3   	/* store result in Dest */
+			add eax, 8   	/* increase Src1 register pointer by 8 */
+			add edi, 8   	/* increase Dest register pointer by 8 */
+			dec              ecx    	/* decrease loop counter */
+			jnz             L1031    	/* check loop termination, proceed if required */
+			emms                      	/* exit MMX state */
+			popa
+	}
+#else
+	/* i386 and x86_64: same operation expressed with MMX intrinsics */
+	__m64 *mSrc1 = (__m64*)Src1;
+	__m64 *mDest = (__m64*)Dest;
+	__m64 mm0, mm1, mm2, mm3;
+
+	int i;
+	/* Duplicate (Nmax-Nmin)/(Cmax-Cmin) in 4 words of MM0 */
+	unsigned short a = Nmax - Nmin;
+	unsigned short b = Cmax - Cmin;
+	if (b == 0) {
+	    a = 255;	/* if div by zero, assume result max byte value */
+	} else {
+	    a /= b;
+	}
+	i = (a<<16)|a;
+	mm0 = _m_from_int(i);
+	mm1 = _m_from_int(i);
+	mm0 = _m_punpckldq(mm0, mm1);			/* fill higher words of MM0 with the factor */
+	/* Duplicate Cmin in 4 words of MM1 */
+	/* NOTE(review): (short)Cmin is sign-extended before the OR, so a negative Cmin would also set the upper word - confirm expected input range */
+	i = (Cmin<<16)|(short)Cmin;
+	mm1 = _m_from_int(i);
+	mm2 = _m_from_int(i);
+	mm1 = _m_punpckldq(mm1, mm2);			/* fill higher words of MM1 with Cmin */
+	/* Duplicate Nmin in 4 words of MM2 */
+	i = (Nmin<<16)|(short)Nmin;
+	mm2 = _m_from_int(i);
+	mm3 = _m_from_int(i);
+	mm2 = _m_punpckldq(mm2, mm3);			/* fill higher words of MM2 with Nmin */
+	__m64 mm7 = _m_from_int(0);			/* zero mm7 register */
+	for (i = 0; i < SrcLength/8; i++) {
+		/* this mm3 shadows the function-level mm3 declared above */
+		__m64 mm3, mm4, mm5, mm6;
+		mm3 = _m_punpcklbw(*mSrc1, mm7);	/* unpack low  bytes of Src1 into words */
+		mm4 = _m_punpckhbw(*mSrc1, mm7);	/* unpack high bytes of Src1 into words */
+		mm3 = _m_psubusb(mm3, mm1);		/* S-Cmin, low	bytes */
+		mm4 = _m_psubusb(mm4, mm1);		/* S-Cmin, high bytes */
+		mm3 = _m_pmullw(mm3, mm0);		/* MM0*(S-Cmin), low  bytes */
+		mm4 = _m_pmullw(mm4, mm0);		/* MM0*(S-Cmin), high bytes */
+		mm3 = _m_paddusb(mm3, mm2);		/* MM0*(S-Cmin)+Nmin, low  bytes */
+		mm4 = _m_paddusb(mm4, mm2);		/* MM0*(S-Cmin)+Nmin, high bytes */
+		/* Take abs value of the signed words */
+		mm5 = _m_psrawi(mm3, 15);		/* fill mm5 words with word sign bit */
+		mm6 = _m_psrawi(mm4, 15);		/* fill mm6 words with word sign bit */
+		mm3 = _m_pxor(mm3, mm5);		/* take 1's complement of only neg. words */
+		mm4 = _m_pxor(mm4, mm6);		/* take 1's complement of only neg. words */
+		mm3 = _m_psubsw(mm3, mm5);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		mm4 = _m_psubsw(mm4, mm6);		/* add 1 to only neg. words, W-(-1) or W-0 */
+		*mDest = _m_packuswb(mm3, mm4);		/* pack words back into bytes with saturation */
+		mSrc1++;
+		mDest++;
+	}
+	_m_empty();					/* clean MMX state */
+#endif
+	return (0);
+#else
+	/* MMX support not compiled in */
+	return (-1);
+#endif
+}
+
+/*!
+\brief Filter using NormalizeLinear: D = saturation255((Nmax - Nmin)/(Cmax - Cmin)*(S - Cmin) + Nmin)
+
+The scale factor (Nmax - Nmin)/(Cmax - Cmin) is evaluated once with integer
+division. In the C routine, results above 255 are clamped to 255, and nothing
+is written when Cmax equals Cmin.
+
+\param Src Pointer to the start of the source byte array (S).
+\param Dest Pointer to the start of the destination byte array (D).
+\param length The number of bytes in the source array.
+\param Cmin Normalization constant.
+\param Cmax Normalization constant.
+\param Nmin Normalization constant.
+\param Nmax Normalization constant.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterNormalizeLinear(unsigned char *Src, unsigned char *Dest, unsigned int length, int Cmin, int Cmax, int Nmin,
+								   int Nmax)
+{
+	unsigned int pos = 0;
+	int span, scale, value;
+
+	/* Reject missing buffers; an empty array needs no work */
+	if (!Src || !Dest) {
+		return (-1);
+	}
+	if (!length) {
+		return (0);
+	}
+
+	if (SDL_imageFilterMMXdetect() && (length >= 8)) {
+		/* Let the MMX helper process the complete groups of 8 bytes */
+		SDL_imageFilterNormalizeLinearMMX(Src, Dest, length, Cmin, Cmax, Nmin, Nmax);
+		if (!(length & 7)) {
+			/* Length is a multiple of 8 - nothing left to do */
+			return (0);
+		}
+		/* Continue with the bytes after the last complete group */
+		pos = length & 0xfffffff8;
+	}
+
+	/* Plain C loop for everything that has not been handled yet */
+	span = Cmax - Cmin;
+	if (!span) {
+		/* The factor would need a division by zero: stop without writing */
+		return (0);
+	}
+	scale = (Nmax - Nmin) / span;
+	while (pos < length) {
+		value = scale * ((int) Src[pos] - Cmin) + Nmin;
+		/* Only the upper bound is clamped */
+		Dest[pos] = (unsigned char) ((value > 255) ? 255 : value);
+		pos++;
+	}
+
+	return (0);
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Filter using ConvolveKernel3x3Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >2.
+\param Kernel The 2D convolution kernel of size 3x3.
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 1 if filter was applied, 0 otherwise.
+*/
+int SDL_imageFilterConvolveKernel3x3Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 3) || (rows < 3) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
+			add edx, 8   	/* second row              |K0 K1 K2 0| */
+				movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			add edx, 8   	/* third row               |K6 K7 K8 0| */
+				movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
+			/* ---, */
+			mov eax, columns   	/* load columns into EAX */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				mov edx, rows   	/* initialize ROWS counter */
+				sub edx, 2   	/* do not use first and last row */
+				/* ---, */
+L10320:
+			mov ecx, eax   	/* initialize COLUMS counter */
+				sub ecx, 2   	/* do not use first and last column */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10322:
+			/* ---, */
+			movq mm1, [esi]   	/* load 8 bytes of the image first row */
+			add esi, eax   	/* move one row below */
+				movq mm2, [esi]   	/* load 8 bytes of the image second row */
+			add esi, eax   	/* move one row below */
+				movq mm3, [esi]   	/* load 8 bytes of the image third row */
+			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
+				pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
+				pmullw mm2, mm6   	/* multiply words second row image*Kernel */
+				pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the first and second rows */
+				paddsw mm1, mm3   	/* add 4 words of the third row and result */
+				movq mm2, mm1   	/* copy MM1 into MM2 */
+				psrlq mm1, 32   	/* shift 2 left words to the right */
+				paddsw mm1, mm2   	/* add 2 left and 2 right result words */
+				movq mm3, mm1   	/* copy MM1 into MM3 */
+				psrlq mm1, 16   	/* shift 1 left word to the right */
+				paddsw mm1, mm3   	/* add 1 left and 1 right result words */
+				/* --, */
+				movd mm2, eax   	/* save EAX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm1   	/* copy MM1 into EAX */
+				psraw mm1, 15   	/* spread sign bit of the result */
+				movd edx, mm1   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm1, eax   	/* move result of division into MM1 */
+				packuswb mm1, mm0   	/* pack division result with saturation */
+				movd eax, mm1   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd eax, mm2   	/* restore saved EAX */
+				/* --, */
+				sub esi, eax   	/* move two rows up */
+				sub esi, eax   	/* */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10322    	/* check loop termination, proceed if required */
+				add esi, 2   	/* move to the next row in Src */
+				add edi, 2   	/* move to the next row in Dest */
+				dec              edx    	/* decrease loop counter ROWS */
+				jnz            L10320    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
+			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
+			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
+			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
+			/* --- */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
+			"sub          $2, %%edx \n\t"	/* do not use first and last row */
+			/* --- */
+			".L10320:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMS counter */
+			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10322:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
+			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
+			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
+			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
+			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words */
+			/* -- */
+			"movd      %%eax, %%mm2 \n\t"	/* save EAX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm1, %%eax \n\t"	/* copy MM1 into EAX */
+			"psraw       $15, %%mm1 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm1, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm1 \n\t"	/* move result of division into MM1 */
+			"packuswb  %%mm0, %%mm1 \n\t"	/* pack division result with saturation */
+			"movd      %%mm1, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"sub       %%eax, %%esi \n\t"	/* move two rows up */
+			"sub       %%eax, %%esi \n\t"	/* */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10322 \n\t"	/* check loop termination, proceed if required */
+			"add          $2, %%esi \n\t"	/* move to the next row in Src */
+			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10320 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel5x5Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source. The outer 2 rows and 2 columns are not written.
+\param rows Number of rows in source/destination array. Must be >4.
+\param columns Number of columns in source/destination array. Must be >4.
+\param Kernel The 2D convolution kernel of size 5x5. The MMX code reads 8 words per kernel row (5 rows, 40 words in total); presumably each row is padded with 3 zero words - TODO confirm against callers.
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function. The MMX code loads 8 bytes per source row, so the last window reads 3 bytes past Src + rows*columns.
+
+\return Returns 0 after taking the MMX path, or -1 if a parameter is invalid or SDL_imageFilterMMXdetect() reports no MMX.
+*/
+int SDL_imageFilterConvolveKernel5x5Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters: reject NULL buffers */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	/* The image must hold at least one full 5x5 window and the divisor must be non-zero */
+	if ((columns < 5) || (rows < 5) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+/* The asm below is only compiled when USE_MMX is defined and the target is i386 */
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				movd mm5, ebx   	/* copy Divisor into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 2   	/* 2 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				shl eax, 1   	/* EAX = columns * 2 */
+				add edi, eax   	/* 2 row offset from the top edge */
+				shr eax, 1   	/* EAX = columns */
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 4   	/* do not use first 2 and last 2 rows */
+				/* --- outer loop: one iteration per output row */
+L10330:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 4   	/* do not use first 2 and last 2 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10332:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1: window row 1 times kernel row 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2: window row 2 times kernel row 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3: window row 3 times kernel row 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4: window row 4 times kernel row 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5: window row 5 times kernel row 5 (ESI is not advanced, EDX only once) */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- horizontal sum: fold the 4 accumulator words into the low word of MM7 */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				/* --- divide: EAX, EBX and EDX are needed by IDIV, so park them in MMX registers */
+				movd mm1, eax   	/* save EAX in MM1 */
+				movd mm2, ebx   	/* save EBX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm7   	/* load summation result into EAX */
+				psraw mm7, 15   	/* spread sign bit of the result */
+				movd ebx, mm5   	/* load Divisor into EBX */
+				movd edx, mm7   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm7, eax   	/* move result of division into MM7 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd ebx, mm2   	/* restore saved EBX */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --- advance to the next pixel */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 72   	/* undo the nine 8-byte advances above: EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* --- loop control */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10332    	/* check loop termination, proceed if required */
+				add esi, 4   	/* move to the next row in Src */
+				add edi, 4   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10330    	/* check loop termination, proceed if required */
+				/* --- done */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else	/* GCC__ defined: the same routine written as AT&T-syntax extended asm */
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
+			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
+			"shr          $1, %%eax \n\t"	/* EAX = columns */
+			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
+			/* --- outer loop: one iteration per output row */
+			".L10330:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10332:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1: window row 1 times kernel row 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2: window row 2 times kernel row 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3: window row 3 times kernel row 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4: window row 4 times kernel row 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5: window row 5 times kernel row 5 (ESI is not advanced, EDX only once) */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- horizontal sum: fold the 4 accumulator words into the low word of MM7 */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			/* --- divide: EAX, EBX and EDX are needed by IDIV, so park them in MMX registers */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"movd      %%ebx, %%mm2 \n\t"	/* save EBX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
+			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
+			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* --- advance to the next pixel */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub         $72, %%edx \n\t"	/* undo the nine 8-byte advances above: EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- loop control */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10332 \n\t"	/* check loop termination, proceed if required */
+			"add          $4, %%esi \n\t"	/* move to the next row in Src */
+			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10330 \n\t"	/* check loop termination, proceed if required */
+			/* --- done */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 - NOTE(review): the asm only reads this pointer and stores pixels through it; no memory clobber is declared */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif	/* GCC__ */
+#endif	/* USE_MMX && i386 */
+		return (0);	/* NOTE(review): also reached when the asm above is compiled out, in which case Dest is left untouched */
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel7x7Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >6.
+\param columns Number of columns in source/destination array. Must be >6.
+\param Kernel The 2D convolution kernel of size 7x7.
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 1 if filter was applied, 0 otherwise.
+*/
+int SDL_imageFilterConvolveKernel7x7Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 7) || (rows < 7) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				movd mm5, ebx   	/* copy Divisor into MM5 */
+				mov edx, Kernel  	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 3   	/* 3 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 3 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 6   	/* do not use first 3 and last 3 rows */
+				/* ---, */
+L10340:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 6   	/* do not use first 3 and last 3 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10342:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				/* ---, */
+				movd mm1, eax   	/* save EDX in MM1 */
+				movd mm2, ebx   	/* save EDX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm7   	/* load summation result into EAX */
+				psraw mm7, 15   	/* spread sign bit of the result */
+				movd ebx, mm5   	/* load Divisor into EBX */
+				movd edx, mm7   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm7, eax   	/* move result of division into MM7 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd ebx, mm2   	/* restore saved EBX */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 104   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10342    	/* check loop termination, proceed if required */
+				add esi, 6   	/* move to the next row in Src */
+				add edi, 6   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10340    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
+			/* --- */
+			".L10340:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10342:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			/* --- */
+			"movd      %%eax, %%mm1 \n\t"	/* save EDX in MM1 */
+			"movd      %%ebx, %%mm2 \n\t"	/* save EDX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
+			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
+			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $104, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10342 \n\t"	/* check loop termination, proceed if required */
+			"add          $6, %%esi \n\t"	/* move to the next row in Src */
+			"add          $6, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10340 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel9x9Divide: Dij = saturation0and255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >8.
+\param columns Number of columns in source/destination array. Must be >8.
+\param Kernel The 2D convolution kernel of size 9x9.
+\param Divisor The divisor of the convolution sum. Must be >0.
+
+Note: Non-MMX implementation not available for this function.
+
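+\return Returns -1 if Src, Dest or Kernel is NULL, if rows or columns is <9, or if Divisor is 0; presumably 0 once the filter was applied, as in the 7x7 variant (TODO confirm).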
+*/
+int SDL_imageFilterConvolveKernel9x9Divide(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+										   signed short *Kernel, unsigned char Divisor)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 9) || (rows < 9) || (Divisor == 0))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, Divisor   	/* load Divisor into BL */
+				movd mm5, ebx   	/* copy Divisor into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 4   	/* 4 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 4 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 8   	/* do not use first 4 and last 4 rows */
+				/* ---, */
+L10350:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 8   	/* do not use first 4 and last 4 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10352:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 8 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 9 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult. 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm3, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				pmullw mm1, mm3   	/* mult. 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				/* ---, */
+				movd mm1, eax   	/* save EDX in MM1 */
+				movd mm2, ebx   	/* save EDX in MM2 */
+				movd mm3, edx   	/* save EDX in MM3 */
+				movd eax, mm7   	/* load summation result into EAX */
+				psraw mm7, 15   	/* spread sign bit of the result */
+				movd ebx, mm5   	/* load Divisor into EBX */
+				movd edx, mm7   	/* fill EDX with a sign bit */
+				idiv bx    	/* IDIV - VERY EXPENSIVE */
+				movd mm7, eax   	/* move result of division into MM7 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd edx, mm3   	/* restore saved EDX */
+				movd ebx, mm2   	/* restore saved EBX */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 208   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10352    	/* check loop termination, proceed if required */
+				add esi, 8   	/* move to the next row in Src */
+				add edi, 8   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10350    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load Divisor into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy Divisor into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
+			/* --- */
+			".L10350:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10352:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 8 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 9 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			/* --- */
+			"movd      %%eax, %%mm1 \n\t"	/* save EDX in MM1 */
+			"movd      %%ebx, %%mm2 \n\t"	/* save EDX in MM2 */
+			"movd      %%edx, %%mm3 \n\t"	/* save EDX in MM3 */
+			"movd      %%mm7, %%eax \n\t"	/* load summation result into EAX */
+			"psraw       $15, %%mm7 \n\t"	/* spread sign bit of the result */
+			"movd      %%mm5, %%ebx \n\t"	/* load Divisor into EBX */
+			"movd      %%mm7, %%edx \n\t"	/* fill EDX with a sign bit */
+			"idivw             %%bx \n\t"	/* IDIV - VERY EXPENSIVE */
+			"movd      %%eax, %%mm7 \n\t"	/* move result of division into MM7 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm3, %%edx \n\t"	/* restore saved EDX */
+			"movd      %%mm2, %%ebx \n\t"	/* restore saved EBX */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $208, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10352 \n\t"	/* check loop termination, proceed if required */
+			"add          $8, %%esi \n\t"	/* move to the next row in Src */
+			"add          $8, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10350 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(Divisor)		/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel3x3ShiftRight: Dij = saturation0and255( ... ) 
+
+For every interior pixel, the 3x3 neighbourhood in Src is multiplied element-wise with
+Kernel, the products are summed in signed 16-bit words and the sum is saturated to 0..255
+and stored in Dest. Each source pixel is shifted right by NRightShift bits before it is
+multiplied by its kernel coefficient. The one-pixel border of Dest is not written.
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >2.
+\param Kernel The 2D convolution kernel of size 3x3, stored as 3 rows of 4 signed shorts (12 values).
+The MMX code loads and sums 4 words per row, so the 4th word of every row has to be 0.
+\param NRightShift The number of right bit shifts applied to each source pixel before multiplication. Must be <=7 (larger values are rejected).
+
+Note: Non-MMX implementation not available for this function.
+
+Note: The MMX code loads 8 source bytes per row at every pixel position; at the last processed
+position this reaches up to 5 bytes past the end of a rows*columns source array.
+
+\return Returns 0 for success or -1 for error (invalid parameter, or no MMX code path available).
+*/
+int SDL_imageFilterConvolveKernel3x3ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 3) || (rows < 3) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm4, ebx   	/* copy NRightShift into MM4 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				movq mm5, [edx]   	/* MM5 = {0,K2,K1,K0} */
+			add edx, 8   	/* second row              |K0 K1 K2 0| */
+				movq mm6, [edx]   	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			add edx, 8   	/* third row               |K6 K7 K8 0| */
+				movq mm7, [edx]   	/* MM7 = {0,K8,K7,K6} */
+			/* ---, */
+			mov eax, columns   	/* load columns into EAX */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				mov edx, rows   	/* initialize ROWS counter */
+				sub edx, 2   	/* do not use first and last row */
+				/* ---, */
+L10360:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 2   	/* do not use first and last column */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10362:
+			/* ---, */
+			movq mm1, [esi]   	/* load 8 bytes of the image first row */
+			add esi, eax   	/* move one row below */
+				movq mm2, [esi]   	/* load 8 bytes of the image second row */
+			add esi, eax   	/* move one row below */
+				movq mm3, [esi]   	/* load 8 bytes of the image third row */
+			punpcklbw mm1, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm2, mm0   	/* unpack first 4 bytes into words */
+				punpcklbw mm3, mm0   	/* unpack first 4 bytes into words */
+				psrlw mm1, mm4   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm4   	/* shift right each pixel NshiftRight times */
+				psrlw mm3, mm4   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm5   	/* multiply words first row  image*Kernel */
+				pmullw mm2, mm6   	/* multiply words second row image*Kernel */
+				pmullw mm3, mm7   	/* multiply words third row  image*Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the first and second rows */
+				paddsw mm1, mm3   	/* add 4 words of the third row and result */
+				movq mm2, mm1   	/* copy MM1 into MM2 */
+				psrlq mm1, 32   	/* shift 2 left words to the right */
+				paddsw mm1, mm2   	/* add 2 left and 2 right result words */
+				movq mm3, mm1   	/* copy MM1 into MM3 */
+				psrlq mm1, 16   	/* shift 1 left word to the right */
+				paddsw mm1, mm3   	/* add 1 left and 1 right result words */
+				packuswb mm1, mm0   	/* pack shift result with saturation */
+				movd ebx, mm1   	/* copy saturated result into EBX */
+				mov [edi], bl   	/* copy a byte result into Dest */
+				/* --, */
+				sub esi, eax   	/* move two rows up */
+				sub esi, eax
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10362    	/* check loop termination, proceed if required */
+				add esi, 2   	/* move to the next row in Src */
+				add edi, 2   	/* move to the next row in Dest */
+				dec              edx    	/* decrease loop counter ROWS */
+				jnz            L10360    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		/* NOTE(review): Dest is declared as an output-only "=m" operand although the code only
+		   reads it (mov %0, %%edi), and no "memory"/"cc" clobbers are listed. All general
+		   purpose registers are saved/restored by pusha/popa. Left unchanged here. */
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm4 \n\t"	/* copy NRightShift into MM4 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"movq    (%%edx), %%mm5 \n\t"	/* MM5 = {0,K2,K1,K0} */
+			"add          $8, %%edx \n\t"	/* second row              |K0 K1 K2 0| */
+			"movq    (%%edx), %%mm6 \n\t"	/* MM6 = {0,K5,K4,K3}  K = |K3 K4 K5 0| */
+			"add          $8, %%edx \n\t"	/* third row               |K6 K7 K8 0| */
+			"movq    (%%edx), %%mm7 \n\t"	/* MM7 = {0,K8,K7,K6} */
+			/* --- */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
+			"sub          $2, %%edx \n\t"	/* do not use first and last row */
+			/* --- */
+			".L10360:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $2, %%ecx \n\t"	/* do not use first and last column */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10362:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the image first row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes of the image second row */
+			"add       %%eax, %%esi \n\t"	/* move one row below */
+			"movq    (%%esi), %%mm3 \n\t"	/* load 8 bytes of the image third row */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack first 4 bytes into words */
+			"punpcklbw %%mm0, %%mm3 \n\t"	/* unpack first 4 bytes into words */
+			"psrlw     %%mm4, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm4, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm4, %%mm3 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm5, %%mm1 \n\t"	/* multiply words first row  image*Kernel */
+			"pmullw    %%mm6, %%mm2 \n\t"	/* multiply words second row image*Kernel */
+			"pmullw    %%mm7, %%mm3 \n\t"	/* multiply words third row  image*Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the first and second rows */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 4 words of the third row and result */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"psrlq       $32, %%mm1 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm1, %%mm3 \n\t"	/* copy MM1 into MM3 */
+			"psrlq       $16, %%mm1 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm3, %%mm1 \n\t"	/* add 1 left and 1 right result words */
+			"packuswb  %%mm0, %%mm1 \n\t"	/* pack shift result with saturation */
+			"movd      %%mm1, %%ebx \n\t"	/* copy saturated result into EBX */
+			"mov      %%bl, (%%edi) \n\t"	/* copy a byte result into Dest */
+			/* -- */
+			"sub       %%eax, %%esi \n\t"	/* move two rows up */
+			"sub       %%eax, %%esi \n\t" "inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10362 \n\t"	/* check loop termination, proceed if required */
+			"add          $2, %%esi \n\t"	/* move to the next row in Src */
+			"add          $2, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10360 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+		return (0);
+#else
+		/* MMX was reported at run time, but the MMX code path is not compiled in:
+		   nothing has been written to Dest, so report an error instead of success. */
+		return (-1);
+#endif
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel5x5ShiftRight: Dij = saturation0and255( ... ) 
+
+For every interior pixel, the 5x5 neighbourhood in Src is multiplied element-wise with
+Kernel, the products are summed in signed 16-bit words and the sum is saturated to 0..255
+and stored in Dest. Each source pixel is shifted right by NRightShift bits before it is
+multiplied by its kernel coefficient. The two-pixel border of Dest is not written.
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >4.
+\param columns Number of columns in source/destination array. Must be >4.
+\param Kernel The 2D convolution kernel of size 5x5, stored as 5 rows of 8 signed shorts (40 values).
+The MMX code loads and sums 8 words per row, so the last 3 words of every row have to be 0.
+\param NRightShift The number of right bit shifts applied to each source pixel before multiplication. Must be <=7 (larger values are rejected).
+
+Note: Non-MMX implementation not available for this function.
+
+Note: The MMX code loads 8 source bytes per row at every pixel position; at the last processed
+position this reaches up to 3 bytes past the end of a rows*columns source array.
+
+\return Returns 0 for success or -1 for error (invalid parameter, or no MMX code path available).
+*/
+int SDL_imageFilterConvolveKernel5x5ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 5) || (rows < 5) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm5, ebx   	/* copy NRightShift into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 2   	/* 2 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				shl eax, 1   	/* EAX = columns * 2 */
+				add edi, eax   	/* 2 row offset from the top edge */
+				shr eax, 1   	/* EAX = columns */
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 4   	/* do not use first 2 and last 2 rows */
+				/* ---, */
+L10370:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 4   	/* do not use first 2 and last 2 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10372:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				movd mm1, eax   	/* save EAX in MM1 */
+				packuswb mm7, mm0   	/* pack summation result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 72   	/* EDX = Kernel address (undo the 9 increments of 8 bytes) */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10372    	/* check loop termination, proceed if required */
+				add esi, 4   	/* move to the next row in Src */
+				add edi, 4   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10370    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		/* NOTE(review): Dest is declared as an output-only "=m" operand although the code only
+		   reads it (mov %0, %%edi), and no "memory"/"cc" clobbers are listed. All general
+		   purpose registers are saved/restored by pusha/popa. Left unchanged here. */
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $2, %%edi \n\t"	/* 2 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"shl          $1, %%eax \n\t"	/* EAX = columns * 2 */
+			"add       %%eax, %%edi \n\t"	/* 2 row offset from the top edge */
+			"shr          $1, %%eax \n\t"	/* EAX = columns */
+			"mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $4, %%ebx \n\t"	/* do not use first 2 and last 2 rows */
+			/* --- */
+			".L10370:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $4, %%ecx \n\t"	/* do not use first 2 and last 2 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10372:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack summation result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub         $72, %%edx \n\t"	/* EDX = Kernel address (undo the 9 increments of 8 bytes) */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10372 \n\t"	/* check loop termination, proceed if required */
+			"add          $4, %%esi \n\t"	/* move to the next row in Src */
+			"add          $4, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10370 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+		return (0);
+#else
+		/* MMX was reported at run time, but the MMX code path is not compiled in:
+		   nothing has been written to Dest, so report an error instead of success. */
+		return (-1);
+#endif
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using ConvolveKernel7x7ShiftRight: Dij = saturation0and255( ... ), source pixels pre-shifted right.
+
+\param Src The source 2D byte array to convolve. Should be different from destination. The MMX code loads 8 bytes per kernel row.
+\param Dest The destination 2D byte array to store the result in. Only pixels at least 3 away from every edge are written.
+\param rows Number of rows in source/destination array. Must be >6.
+\param columns Number of columns in source/destination array. Must be >6.
+\param Kernel The 2D convolution kernel. The MMX code reads it as 7 rows of 8 signed 16-bit words each.
+\param NRightShift The number of right bit shifts applied to each source pixel before multiplication. Must be <=7.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 0 if the filter was applied, -1 on invalid parameters or when no MMX implementation could be run.
+*/
+int SDL_imageFilterConvolveKernel7x7ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters: all three buffers are required */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 7) || (rows < 7) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+	/* The MMX code below is only compiled when USE_MMX and i386 are both defined */
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm5, ebx   	/* copy NRightShift into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 3   	/* 3 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 3 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 6   	/* do not use first 3 and last 3 rows */
+				/* ---, */
+L10380:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 6   	/* do not use first 3 and last 3 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10382:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				movd mm1, eax   	/* save EAX in MM1 */
+				packuswb mm7, mm0   	/* pack accumulated sum into bytes with unsigned saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 104   	/* EDX = Kernel address (undo the 13 increments of 8) */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10382    	/* check loop termination, proceed if required */
+				add esi, 6   	/* move to the next row in Src */
+				add edi, 6   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10380    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $3, %%edi \n\t"	/* 3 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 3 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $6, %%ebx \n\t"	/* do not use first 3 and last 3 rows */
+			/* --- */
+			".L10380:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $6, %%ecx \n\t"	/* do not use first 3 and last 3 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10382:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack accumulated sum into bytes with unsigned saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $104, %%edx \n\t"	/* EDX = Kernel address (undo the 13 increments of 8) */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10382 \n\t"	/* check loop termination, proceed if required */
+			"add          $6, %%esi \n\t"	/* move to the next row in Src */
+			"add          $6, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10380 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+		return (0);	/* reached only when the MMX code above was compiled in and has run */
+#endif
+	}
+
+	/* No non-MMX implementation yet; also reached when the MMX code is not compiled in */
+	return (-1);
+}
+
+/*!
+\brief Filter using ConvolveKernel9x9ShiftRight: Dij = saturation255( ... ) 
+
+\param Src The source 2D byte array to convolve. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >8.
+\param columns Number of columns in source/destination array. Must be >8.
+\param Kernel The 2D convolution kernel of size 9x9.
+\param NRightShift The number of right bit shifts applied to each source pixel before multiplication. Must be <=7.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 1 if filter was applied, 0 otherwise.
+*/
+int SDL_imageFilterConvolveKernel9x9ShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+											   signed short *Kernel, unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL) || (Kernel == NULL))
+		return(-1);
+
+	if ((columns < 9) || (rows < 9) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm5, ebx   	/* copy NRightShift into MM5 */
+				mov edx, Kernel   	/* load Kernel address into EDX */
+				mov esi, Src   	/* load Src  address to ESI */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, 4   	/* 4 column offset from the left edge */
+				mov eax, columns   	/* load columns into EAX */
+				add edi, eax   	/* 4 row offset from the top edge */
+				add edi, eax
+				add edi, eax
+				add edi, eax
+				mov ebx, rows   	/* initialize ROWS counter */
+				sub ebx, 8   	/* do not use first 4 and last 4 rows */
+				/* ---, */
+L10390:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				sub ecx, 8   	/* do not use first 4 and last 4 columns */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10392:
+			pxor mm7, mm7   	/* zero MM7 (accumulator) */
+				movd mm6, esi   	/* save ESI in MM6 */
+				/* --- 1 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 2 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 3 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 4 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 5 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 6 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 8 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			dec              esi
+				add esi, eax   	/* move Src pointer 1 row below */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* --- 9 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm2, mm1   	/* copy MM1 into MM2 */
+				inc              esi    	/* move pointer to the next 8 bytes of Src */
+				movq mm3, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				movq mm4, [edx]   	/* load 4 words of Kernel */
+			add edx, 8   	/* move pointer to other 4 words */
+				punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				punpckhbw mm2, mm0   	/* unpack second 4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				psrlw mm2, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				pmullw mm2, mm4   	/* mult 4 high words of Src and Kernel */
+				paddsw mm1, mm2   	/* add 4 words of the high and low bytes */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				movq mm1, [esi]   	/* load 8 bytes of the Src */
+			movq mm3, [edx]   	/* load 4 words of Kernel */
+			punpcklbw mm1, mm0   	/* unpack first  4 bytes into words */
+				psrlw mm1, mm5   	/* shift right each pixel NshiftRight times */
+				pmullw mm1, mm3   	/* mult 4 low  words of Src and Kernel */
+				paddsw mm7, mm1   	/* add MM1 to accumulator MM7 */
+				/* ---, */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				paddsw mm7, mm3   	/* add 2 left and 2 right result words */
+				movq mm2, mm7   	/* copy MM7 into MM2 */
+				psrlq mm7, 16   	/* shift 1 left word to the right */
+				paddsw mm7, mm2   	/* add 1 left and 1 right result words */
+				movd mm1, eax   	/* save EAX in MM1 */
+				packuswb mm7, mm0   	/* pack division result with saturation */
+				movd eax, mm7   	/* copy saturated result into EAX */
+				mov [edi], al   	/* copy a byte result into Dest */
+				movd eax, mm1   	/* restore saved EAX */
+				/* --, */
+				movd esi, mm6   	/* move Src pointer to the top pixel */
+				sub edx, 208   	/* EDX = Kernel address */
+				inc              esi    	/* move Src  pointer to the next pixel */
+				inc              edi    	/* move Dest pointer to the next pixel */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10392    	/* check loop termination, proceed if required */
+				add esi, 8   	/* move to the next row in Src */
+				add edi, 8   	/* move to the next row in Dest */
+				dec              ebx    	/* decrease loop counter ROWS */
+				jnz            L10390    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %5, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm5 \n\t"	/* copy NRightShift into MM5 */
+			"mov          %4, %%edx \n\t"	/* load Kernel address into EDX */
+			"mov          %1, %%esi \n\t"	/* load Src  address to ESI */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add          $4, %%edi \n\t"	/* 4 column offset from the left edge */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"add       %%eax, %%edi \n\t"	/* 4 row offset from the top edge */
+			"add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "add       %%eax, %%edi \n\t" "mov          %2, %%ebx \n\t"	/* initialize ROWS counter */
+			"sub          $8, %%ebx \n\t"	/* do not use first 4 and last 4 rows */
+			/* --- */
+			".L10390:               \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"sub          $8, %%ecx \n\t"	/* do not use first 4 and last 4 columns */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10392:               \n\t" "pxor      %%mm7, %%mm7 \n\t"	/* zero MM7 (accumulator) */
+			"movd      %%esi, %%mm6 \n\t"	/* save ESI in MM6 */
+			/* --- 1 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 2 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 3 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 4 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 5 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 6 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 8 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"dec              %%esi \n\t" "add       %%eax, %%esi \n\t"	/* move Src pointer 1 row below */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- 9 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq      %%mm1, %%mm2 \n\t"	/* copy MM1 into MM2 */
+			"inc              %%esi \n\t"	/* move pointer to the next 8 bytes of Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"movq    (%%edx), %%mm4 \n\t"	/* load 4 words of Kernel */
+			"add          $8, %%edx \n\t"	/* move pointer to other 4 words */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"punpckhbw %%mm0, %%mm2 \n\t"	/* unpack second 4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"psrlw     %%mm5, %%mm2 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"pmullw    %%mm4, %%mm2 \n\t"	/* mult. 4 high words of Src and Kernel */
+			"paddsw    %%mm2, %%mm1 \n\t"	/* add 4 words of the high and low bytes */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			"movq    (%%esi), %%mm1 \n\t"	/* load 8 bytes of the Src */
+			"movq    (%%edx), %%mm3 \n\t"	/* load 4 words of Kernel */
+			"punpcklbw %%mm0, %%mm1 \n\t"	/* unpack first  4 bytes into words */
+			"psrlw     %%mm5, %%mm1 \n\t"	/* shift right each pixel NshiftRight times */
+			"pmullw    %%mm3, %%mm1 \n\t"	/* mult. 4 low  words of Src and Kernel */
+			"paddsw    %%mm1, %%mm7 \n\t"	/* add MM1 to accumulator MM7 */
+			/* --- */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"paddsw    %%mm3, %%mm7 \n\t"	/* add 2 left and 2 right result words */
+			"movq      %%mm7, %%mm2 \n\t"	/* copy MM7 into MM2 */
+			"psrlq       $16, %%mm7 \n\t"	/* shift 1 left word to the right */
+			"paddsw    %%mm2, %%mm7 \n\t"	/* add 1 left and 1 right result words */
+			"movd      %%eax, %%mm1 \n\t"	/* save EAX in MM1 */
+			"packuswb  %%mm0, %%mm7 \n\t"	/* pack division result with saturation */
+			"movd      %%mm7, %%eax \n\t"	/* copy saturated result into EAX */
+			"mov      %%al, (%%edi) \n\t"	/* copy a byte result into Dest */
+			"movd      %%mm1, %%eax \n\t"	/* restore saved EAX */
+			/* -- */
+			"movd      %%mm6, %%esi \n\t"	/* move Src pointer to the top pixel */
+			"sub        $208, %%edx \n\t"	/* EDX = Kernel address */
+			"inc              %%esi \n\t"	/* move Src  pointer to the next pixel */
+			"inc              %%edi \n\t"	/* move Dest pointer to the next pixel */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10392 \n\t"	/* check loop termination, proceed if required */
+			"add          $8, %%esi \n\t"	/* move to the next row in Src */
+			"add          $8, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%ebx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10390 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns),		/* %3 */
+			"m"(Kernel),		/* %4 */
+			"m"(NRightShift)	/* %5 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/* ------------------------------------------------------------------------------------ */
+
+/*!
+\brief Filter using SobelX: Dij = saturation255( ... ) 
+
+\param Src The source 2D byte array to sobel-filter. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >2.
+\param columns Number of columns in source/destination array. Must be >7.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 0 for success or -1 for error.
+*/
+int SDL_imageFilterSobelX(unsigned char *Src, unsigned char *Dest, int rows, int columns)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL))
+		return(-1);
+
+	if ((columns < 8) || (rows < 3))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				mov eax, columns   	/* load columns into EAX */
+				/* ---, */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				mov edx, rows   	/* initialize ROWS counter */
+				sub edx, 2   	/* do not use first and last rows */
+				/* ---, */
+L10400:
+			mov ecx, eax   	/* initialize COLUMS counter */
+				shr ecx, 3   	/* EBX/8 (MMX loads 8 bytes at a time) */
+				mov ebx, esi   	/* save ESI in EBX */
+				movd mm1, edi   	/* save EDI in MM1 */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10402:
+			/* ---, */
+			movq mm4, [esi]   	/* load 8 bytes from Src */
+			movq mm5, mm4   	/* save MM4 in MM5 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm4, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
+				movq mm6, [esi]   	/* load 8 bytes from Src */
+			movq mm7, mm6   	/* save MM6 in MM7 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm6, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
+				add esi, eax   	/* move to the next row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumolator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumolator MM5 */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumolator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumolator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumolator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumolator MM7 */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumolator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumolator MM7 */
+				add esi, eax   	/* move to the next row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumolator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumolator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumolator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumolator MM7 */
+				/* ---, */
+				movq mm2, mm4   	/* copy MM4 into MM2 */
+				psrlq mm4, 32   	/* shift 2 left words to the right */
+				psubw mm4, mm2   	/* MM4 = MM4 - MM2 */
+				movq mm3, mm6   	/* copy MM6 into MM3 */
+				psrlq mm6, 32   	/* shift 2 left words to the right */
+				psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
+				punpckldq mm4, mm6   	/* combine 2 words of MM6 and 2 words of MM4 */
+				movq mm2, mm5   	/* copy MM6 into MM2 */
+				psrlq mm5, 32   	/* shift 2 left words to the right */
+				psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
+				punpckldq mm5, mm7   	/* combine 2 words of MM7 and 2 words of MM5 */
+				/* Take abs values of MM4 and MM5 */
+				movq mm6, mm4   	/* copy MM4 into MM6 */
+				movq mm7, mm5   	/* copy MM5 into MM7 */
+				psraw mm6, 15   	/* fill MM6 words with word sign bit */
+				psraw mm7, 15   	/* fill MM7 words with word sign bit */
+				pxor mm4, mm6   	/* take 1's compliment of only neg words */
+				pxor mm5, mm7   	/* take 1's compliment of only neg words */
+				psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+				psubsw mm5, mm7   	/* add 1 to only neg words, W-(-1) or W-0 */
+				packuswb mm4, mm5   	/* combine and pack/saturate MM5 and MM4 */
+				movq [edi], mm4   	/* store result in Dest */
+				/* ---, */
+				sub esi, eax   	/* move to the current top row in Src */
+				sub esi, eax
+				add esi, 8   	/* move Src  pointer to the next 8 pixels */
+				add edi, 8   	/* move Dest pointer to the next 8 pixels */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10402    	/* check loop termination, proceed if required */
+				mov esi, ebx   	/* restore most left current row Src  address */
+				movd edi, mm1   	/* restore most left current row Dest address */
+				add esi, eax   	/* move to the next row in Src */
+				add edi, eax   	/* move to the next row in Dest */
+				dec              edx    	/* decrease loop counter ROWS */
+				jnz            L10400    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			/* --- */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			"mov          %2, %%edx \n\t"	/* initialize ROWS counter */
+			"sub          $2, %%edx \n\t"	/* do not use first and last rows */
+			/* --- */
+			".L10400:                \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMS counter */
+			"shr          $3, %%ecx \n\t"	/* EBX/8 (MMX loads 8 bytes at a time) */
+			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
+			"movd      %%edi, %%mm1 \n\t"	/* save EDI in MM1 */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10402:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
+			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
+			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumolator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumolator MM5 */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumolator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumolator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumolator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumolator MM7 */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumolator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumolator MM7 */
+			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumolator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumolator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumolator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumolator MM7 */
+			/* --- */
+			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
+			"psrlq       $32, %%mm4 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
+			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
+			"psrlq       $32, %%mm6 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
+			"punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
+			"movq      %%mm5, %%mm2 \n\t"	/* copy MM6 into MM2 */
+			"psrlq       $32, %%mm5 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
+			"punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
+			/* Take abs values of MM4 and MM5 */
+			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
+			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
+			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
+			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
+			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's compliment of only neg. words */
+			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's compliment of only neg. words */
+			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
+			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
+			/* --- */
+			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
+			"sub       %%eax, %%esi \n\t" "add $8,          %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
+			"add $8,          %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10402 \n\t"	/* check loop termination, proceed if required */
+			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
+			"movd      %%mm1, %%edi \n\t"	/* restore most left current row Dest address */
+			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
+			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
+			"dec              %%edx \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10400 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 */
+			"m"(columns)		/* %3 */
+			);
+#endif
+#endif
+		return (0);
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Filter using SobelXShiftRight: Dij = saturation255( |S(i,j+1) - S(i,j-1)| ), S being the 1-2-1 sum over rows i-1,i,i+1 of the source bytes, each shifted right by NRightShift first.
+
+\param Src The source 2D byte array to sobel-filter. Should be different from destination.
+\param Dest The destination 2D byte array to store the result in. Should be different from source.
+\param rows Number of rows in source/destination array. Must be >=3 (smaller values are rejected).
+\param columns Number of columns in source/destination array. Must be >=8 (smaller values are rejected).
+\param NRightShift The number of right bit shifts to apply to each source byte before summing. Must be <=7.
+
+Note: Non-MMX implementation not available for this function.
+
+\return Returns 0 if SDL_imageFilterMMXdetect() reported MMX, -1 for NULL pointers, rejected arguments or when MMX is not detected.
+*/
+int SDL_imageFilterSobelXShiftRight(unsigned char *Src, unsigned char *Dest, int rows, int columns,
+									unsigned char NRightShift)
+{
+	/* Validate input parameters */
+	if ((Src == NULL) || (Dest == NULL))
+		return(-1);
+	if ((columns < 8) || (rows < 3) || (NRightShift > 7))
+		return (-1);
+
+	if ((SDL_imageFilterMMXdetect())) {
+//#ifdef USE_MMX
+#if defined(USE_MMX) && defined(i386)
+#if !defined(GCC__)
+		__asm
+		{
+			pusha
+				pxor mm0, mm0   	/* zero MM0 */
+				mov eax, columns   	/* load columns into EAX */
+				xor ebx, ebx   	/* zero EBX */
+				mov bl, NRightShift   	/* load NRightShift into BL */
+				movd mm1, ebx   	/* copy NRightShift into MM1 */
+				/* ---, */
+				mov esi, Src   	/* ESI = Src row 0 address */
+				mov edi, Dest   	/* load Dest address to EDI */
+				add edi, eax   	/* EDI = EDI + columns */
+				inc              edi    	/* 1 byte offset from the left edge */
+				/* initialize ROWS counter (it is kept in the 'rows' parameter itself) */
+				sub rows, 2   	/* do not use first and last rows */
+				/* ---, */
+L10410:
+			mov ecx, eax   	/* initialize COLUMNS counter */
+				shr ecx, 3   	/* ECX = columns/8 (MMX loads 8 bytes at a time) */
+				mov ebx, esi   	/* save ESI in EBX */
+				mov edx, edi   	/* save EDI in EDX */
+				align 16                 	/* 16 byte alignment of the loop entry */
+L10412:
+			/* ---, */
+			movq mm4, [esi]   	/* load 8 bytes from Src (top row, columns 0..7) */
+			movq mm5, mm4   	/* save MM4 in MM5 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm4, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm5, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm4, mm1   	/* shift right each pixel NRightShift times */
+				psrlw mm5, mm1   	/* shift right each pixel NRightShift times */
+				movq mm6, [esi]   	/* load 8 bytes from Src (top row, columns 2..9) */
+			movq mm7, mm6   	/* save MM6 in MM7 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm6, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm7, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm6, mm1   	/* shift right each pixel NRightShift times */
+				psrlw mm7, mm1   	/* shift right each pixel NRightShift times */
+				add esi, eax   	/* move to the next row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src (middle row, added twice below) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
+				psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumulator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumulator MM5 */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumulator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumulator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
+				psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumulator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumulator MM7 */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumulator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumulator MM7 */
+				add esi, eax   	/* move to the next row of Src */
+				movq mm2, [esi]   	/* load 8 bytes from Src (bottom row, added once) */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				add esi, 2   	/* move ESI pointer 2 bytes right */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
+				psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
+				paddw mm4, mm2   	/* add 4 low  bytes to accumulator MM4 */
+				paddw mm5, mm3   	/* add 4 high bytes to accumulator MM5 */
+				movq mm2, [esi]   	/* load 8 bytes from Src */
+			movq mm3, mm2   	/* save MM2 in MM3 */
+				sub esi, 2   	/* move ESI pointer back 2 bytes left */
+				punpcklbw mm2, mm0   	/* unpack 4 low  bytes into words */
+				punpckhbw mm3, mm0   	/* unpack 4 high bytes into words */
+				psrlw mm2, mm1   	/* shift right each pixel NRightShift times */
+				psrlw mm3, mm1   	/* shift right each pixel NRightShift times */
+				paddw mm6, mm2   	/* add 4 low  bytes to accumulator MM6 */
+				paddw mm7, mm3   	/* add 4 high bytes to accumulator MM7 */
+				/* ---, */
+				movq mm2, mm4   	/* copy MM4 into MM2 */
+				psrlq mm4, 32   	/* shift 2 left words to the right */
+				psubw mm4, mm2   	/* MM4 = MM4 - MM2 */
+				movq mm3, mm6   	/* copy MM6 into MM3 */
+				psrlq mm6, 32   	/* shift 2 left words to the right */
+				psubw mm6, mm3   	/* MM6 = MM6 - MM3 */
+				punpckldq mm4, mm6   	/* combine 2 words of MM6 and 2 words of MM4 */
+				movq mm2, mm5   	/* copy MM5 into MM2 */
+				psrlq mm5, 32   	/* shift 2 left words to the right */
+				psubw mm5, mm2   	/* MM5 = MM5 - MM2 */
+				movq mm3, mm7   	/* copy MM7 into MM3 */
+				psrlq mm7, 32   	/* shift 2 left words to the right */
+				psubw mm7, mm3   	/* MM7 = MM7 - MM3 */
+				punpckldq mm5, mm7   	/* combine 2 words of MM7 and 2 words of MM5 */
+				/* Take abs values of MM4 and MM5 */
+				movq mm6, mm4   	/* copy MM4 into MM6 */
+				movq mm7, mm5   	/* copy MM5 into MM7 */
+				psraw mm6, 15   	/* fill MM6 words with word sign bit */
+				psraw mm7, 15   	/* fill MM7 words with word sign bit */
+				pxor mm4, mm6   	/* take 1's complement of only neg words */
+				pxor mm5, mm7   	/* take 1's complement of only neg words */
+				psubsw mm4, mm6   	/* add 1 to only neg words, W-(-1) or W-0 */
+				psubsw mm5, mm7   	/* add 1 to only neg words, W-(-1) or W-0 */
+				packuswb mm4, mm5   	/* combine and pack/saturate MM5 and MM4 */
+				movq [edi], mm4   	/* store result in Dest */
+				/* ---, */
+				sub esi, eax   	/* move to the current top row in Src */
+				sub esi, eax
+				add esi, 8   	/* move Src  pointer to the next 8 pixels */
+				add edi, 8   	/* move Dest pointer to the next 8 pixels */
+				/* ---, */
+				dec              ecx    	/* decrease loop counter COLUMNS */
+				jnz            L10412    	/* check loop termination, proceed if required */
+				mov esi, ebx   	/* restore most left current row Src  address */
+				mov edi, edx   	/* restore most left current row Dest address */
+				add esi, eax   	/* move to the next row in Src */
+				add edi, eax   	/* move to the next row in Dest */
+				dec rows    	/* decrease loop counter ROWS */
+				jnz            L10410    	/* check loop termination, proceed if required */
+				/* ---, */
+				emms                      	/* exit MMX state */
+				popa
+		}
+#else
+		asm volatile
+			("pusha		     \n\t" "pxor      %%mm0, %%mm0 \n\t"	/* zero MM0 */
+			"mov          %3, %%eax \n\t"	/* load columns into EAX */
+			"xor       %%ebx, %%ebx \n\t"	/* zero EBX */
+			"mov           %4, %%bl \n\t"	/* load NRightShift into BL */
+			"movd      %%ebx, %%mm1 \n\t"	/* copy NRightShift into MM1 */
+			/* --- */
+			"mov          %1, %%esi \n\t"	/* ESI = Src row 0 address */
+			"mov          %0, %%edi \n\t"	/* load Dest address to EDI */
+			"add       %%eax, %%edi \n\t"	/* EDI = EDI + columns */
+			"inc              %%edi \n\t"	/* 1 byte offset from the left edge */
+			/* initialize ROWS counter (it is kept in the 'rows' memory operand %2 itself) */
+			"subl            $2, %2 \n\t"	/* do not use first and last rows */
+			/* --- */
+			".L10410:                \n\t" "mov       %%eax, %%ecx \n\t"	/* initialize COLUMNS counter */
+			"shr          $3, %%ecx \n\t"	/* ECX = columns/8 (MMX loads 8 bytes at a time) */
+			"mov       %%esi, %%ebx \n\t"	/* save ESI in EBX */
+			"mov       %%edi, %%edx \n\t"	/* save EDI in EDX */
+			".align 16              \n\t"	/* 16 byte alignment of the loop entry */
+			".L10412:               \n\t"
+			/* --- */
+			"movq    (%%esi), %%mm4 \n\t"	/* load 8 bytes from Src (top row, columns 0..7) */
+			"movq      %%mm4, %%mm5 \n\t"	/* save MM4 in MM5 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm4 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm5 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm4 \n\t"	/* shift right each pixel NRightShift times */
+			"psrlw     %%mm1, %%mm5 \n\t"	/* shift right each pixel NRightShift times */
+			"movq    (%%esi), %%mm6 \n\t"	/* load 8 bytes from Src (top row, columns 2..9) */
+			"movq      %%mm6, %%mm7 \n\t"	/* save MM6 in MM7 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm6 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm7 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm6 \n\t"	/* shift right each pixel NRightShift times */
+			"psrlw     %%mm1, %%mm7 \n\t"	/* shift right each pixel NRightShift times */
+			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (middle row, added twice below) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumulator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumulator MM5 */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumulator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumulator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumulator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumulator MM7 */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumulator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumulator MM7 */
+			"add       %%eax, %%esi \n\t"	/* move to the next row of Src */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src (bottom row, added once) */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"add          $2, %%esi \n\t"	/* move ESI pointer 2 bytes right */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
+			"paddw     %%mm2, %%mm4 \n\t"	/* add 4 low  bytes to accumulator MM4 */
+			"paddw     %%mm3, %%mm5 \n\t"	/* add 4 high bytes to accumulator MM5 */
+			"movq    (%%esi), %%mm2 \n\t"	/* load 8 bytes from Src */
+			"movq      %%mm2, %%mm3 \n\t"	/* save MM2 in MM3 */
+			"sub          $2, %%esi \n\t"	/* move ESI pointer back 2 bytes left */
+			"punpcklbw %%mm0, %%mm2 \n\t"	/* unpack 4 low  bytes into words */
+			"punpckhbw %%mm0, %%mm3 \n\t"	/* unpack 4 high bytes into words */
+			"psrlw     %%mm1, %%mm2 \n\t"	/* shift right each pixel NRightShift times */
+			"psrlw     %%mm1, %%mm3 \n\t"	/* shift right each pixel NRightShift times */
+			"paddw     %%mm2, %%mm6 \n\t"	/* add 4 low  bytes to accumulator MM6 */
+			"paddw     %%mm3, %%mm7 \n\t"	/* add 4 high bytes to accumulator MM7 */
+			/* --- */
+			"movq      %%mm4, %%mm2 \n\t"	/* copy MM4 into MM2 */
+			"psrlq       $32, %%mm4 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm2, %%mm4 \n\t"	/* MM4 = MM4 - MM2 */
+			"movq      %%mm6, %%mm3 \n\t"	/* copy MM6 into MM3 */
+			"psrlq       $32, %%mm6 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm3, %%mm6 \n\t"	/* MM6 = MM6 - MM3 */
+			"punpckldq %%mm6, %%mm4 \n\t"	/* combine 2 words of MM6 and 2 words of MM4 */
+			"movq      %%mm5, %%mm2 \n\t"	/* copy MM5 into MM2 */
+			"psrlq       $32, %%mm5 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm2, %%mm5 \n\t"	/* MM5 = MM5 - MM2 */
+			"movq      %%mm7, %%mm3 \n\t"	/* copy MM7 into MM3 */
+			"psrlq       $32, %%mm7 \n\t"	/* shift 2 left words to the right */
+			"psubw     %%mm3, %%mm7 \n\t"	/* MM7 = MM7 - MM3 */
+			"punpckldq %%mm7, %%mm5 \n\t"	/* combine 2 words of MM7 and 2 words of MM5 */
+			/* Take abs values of MM4 and MM5 */
+			"movq      %%mm4, %%mm6 \n\t"	/* copy MM4 into MM6 */
+			"movq      %%mm5, %%mm7 \n\t"	/* copy MM5 into MM7 */
+			"psraw       $15, %%mm6 \n\t"	/* fill MM6 words with word sign bit */
+			"psraw       $15, %%mm7 \n\t"	/* fill MM7 words with word sign bit */
+			"pxor      %%mm6, %%mm4 \n\t"	/* take 1's complement of only neg. words */
+			"pxor      %%mm7, %%mm5 \n\t"	/* take 1's complement of only neg. words */
+			"psubsw    %%mm6, %%mm4 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"psubsw    %%mm7, %%mm5 \n\t"	/* add 1 to only neg. words, W-(-1) or W-0 */
+			"packuswb  %%mm5, %%mm4 \n\t"	/* combine and pack/saturate MM5 and MM4 */
+			"movq    %%mm4, (%%edi) \n\t"	/* store result in Dest */
+			/* --- */
+			"sub       %%eax, %%esi \n\t"	/* move to the current top row in Src */
+			"sub       %%eax, %%esi \n\t" "add $8,          %%esi \n\t"	/* move Src  pointer to the next 8 pixels */
+			"add $8,          %%edi \n\t"	/* move Dest pointer to the next 8 pixels */
+			/* --- */
+			"dec              %%ecx \n\t"	/* decrease loop counter COLUMNS */
+			"jnz            .L10412 \n\t"	/* check loop termination, proceed if required */
+			"mov       %%ebx, %%esi \n\t"	/* restore most left current row Src  address */
+			"mov       %%edx, %%edi \n\t"	/* restore most left current row Dest address */
+			"add       %%eax, %%esi \n\t"	/* move to the next row in Src */
+			"add       %%eax, %%edi \n\t"	/* move to the next row in Dest */
+			"decl                %2 \n\t"	/* decrease loop counter ROWS */
+			"jnz            .L10410 \n\t"	/* check loop termination, proceed if required */
+			/* --- */
+			"emms                   \n\t"	/* exit MMX state */
+			"popa                   \n\t":"=m" (Dest)	/* %0 -- NOTE(review): declared as an output but the asm only reads it */
+			:"m"(Src),		/* %1 */
+			"m"(rows),		/* %2 -- NOTE(review): declared as an input but modified by subl/decl above */
+			"m"(columns),		/* %3 */
+			"m"(NRightShift)	/* %4 */
+			);	/* NOTE(review): no clobber list is given; the stores through EDI are not declared to the compiler */
+#endif
+#endif
+		return (0);	/* NOTE(review): also reached when the asm above is compiled out (USE_MMX or i386 undefined) */
+	} else {
+		/* No non-MMX implementation yet */
+		return (-1);
+	}
+}
+
+/*!
+\brief Align stack to 32 byte boundary, storing the previous ESP at the new top of stack. The body is empty unless USE_MMX is defined.
+*/
+void SDL_imageFilterAlignStack(void)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{				/* --- stack alignment --- */
+		mov ebx, esp   	/* load ESP into EBX */
+			sub ebx, 4   	/* reserve space on stack for old value of ESP */
+			and ebx, -32   	/* align EBX along a 32 byte boundary (rounds down) */
+			mov [ebx], esp   	/* save old value of ESP in stack, behind the boundary */
+			mov esp, ebx   	/* align ESP along a 32 byte boundary */
+	}
+#else
+	asm volatile
+		(				/* --- stack alignment --- */
+		"mov       %%esp, %%ebx \n\t"	/* load ESP into EBX */
+		"sub          $4, %%ebx \n\t"	/* reserve space on stack for old value of ESP */
+		"and        $-32, %%ebx \n\t"	/* align EBX along a 32 byte boundary (rounds down) */
+		"mov     %%esp, (%%ebx) \n\t"	/* save old value of ESP in stack, behind the boundary */
+		"mov       %%ebx, %%esp \n\t"	/* align ESP along a 32 byte boundary */
+		::);	/* NOTE(review): EBX and ESP are changed but no clobbers are declared */
+#endif
+#endif
+}
+
+/*!
+\brief Restore previously aligned stack by reloading ESP from the word at the current top of stack. The body is empty unless USE_MMX is defined.
+*/
+void SDL_imageFilterRestoreStack(void)
+{
+#ifdef USE_MMX
+#if !defined(GCC__)
+	__asm
+	{				/* --- restoring old stack --- */
+		mov ebx, [esp]   	/* load old value of ESP (presumably the one saved by SDL_imageFilterAlignStack -- confirm ESP is unchanged in between) */
+		mov esp, ebx   	/* restore old value of ESP */
+	}
+#else
+	asm volatile
+		(				/* --- restoring old stack --- */
+		"mov     (%%esp), %%ebx \n\t"	/* load old value of ESP (presumably the one saved by SDL_imageFilterAlignStack -- confirm ESP is unchanged in between) */
+		"mov       %%ebx, %%esp \n\t"	/* restore old value of ESP */
+		::);	/* NOTE(review): EBX and ESP are changed but no clobbers are declared */
+#endif
+#endif
+}
diff --git a/src/gfx/SDL_rotozoom.c b/src/gfx/SDL_rotozoom.c
new file mode 100644
index 0000000..53fc90c
--- /dev/null
+++ b/src/gfx/SDL_rotozoom.c
@@ -0,0 +1,1717 @@
+/*  
+
+SDL_rotozoom.c: rotozoomer, zoomer and shrinker for 32bit or 8bit surfaces
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+
+*/
+
+#ifdef WIN32
+#include <windows.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "SDL_rotozoom.h"
+
+/* ---- Internally used structures */
+
+/*!
+\brief A 32 bit RGBA pixel: four consecutive 8 bit channels, used to address 32 bit surface pixels in memory order.
+*/
+typedef struct tColorRGBA {
+	Uint8 r;	/* first byte of the pixel (red when the surface layout is RGBA) */
+	Uint8 g;	/* second byte of the pixel */
+	Uint8 b;	/* third byte of the pixel (blue when the surface layout is RGBA) */
+	Uint8 a;	/* fourth byte of the pixel (alpha when the surface layout is RGBA) */
+} tColorRGBA;
+
+/*!
+\brief A 8bit Y/palette pixel: one byte per pixel.
+*/
+typedef struct tColorY {
+	Uint8 y;	/* the single byte of the pixel (Y value or palette index) */
+} tColorY;
+
+/*! 
+\brief Returns maximum of two numbers a and b. The selected argument is evaluated twice, so do not pass expressions with side effects.
+*/
+#define MAX(a,b)    (((a) > (b)) ? (a) : (b))
+
+/*! 
+\brief Number of guard rows added to destination surfaces.
+
+This is a simple but effective workaround for observed issues.
+These rows allocate extra memory and are then hidden from the surface.
+Rows are added to the end of destination surfaces when they are allocated. 
+This catches any potential overflows which seem to happen with 
+just the right src image dimensions and scale/rotation and can lead
+to a situation where the program can segfault.
+*/
+#define GUARD_ROWS (2)
+
+/*!
+\brief Lower limit of absolute zoom factor or rotation degrees (a double constant).
+*/
+#define VALUE_LIMIT	0.001
+
+/*!
+\brief Looks up the colorkey value stored for a surface.
+\return The colorkey, or 0 when it could not be read.
+*/
+Uint32 _colorkey(SDL_Surface *src)
+{
+#if (SDL_MINOR_VERSION == 3)
+	/* Query through the API; the result stays 0 if the call does not write it */
+	Uint32 result = 0;
+	SDL_GetColorKey(src, &result);
+	return result;
+#else
+	/* Read the key straight from the pixel format; a NULL surface yields 0 */
+	return (src != NULL) ? src->format->colorkey : 0;
+#endif
+}
+
+
+/*!
+\brief Internal 32 bit integer-factor averaging Shrinker.
+
+Shrinks 32 bit RGBA/ABGR 'src' surface to 'dst' surface.
+Each dst pixel is the per-channel integer mean of a factorx by factory box of src pixels.
+Assumes src and dst surfaces are of 32 bit depth.
+Assumes dst surface was allocated with the correct dimensions (every box must lie inside src).
+
+\param src The surface to shrink (input).
+\param dst The shrunken surface (output).
+\param factorx The horizontal shrinking ratio. Must be > 0.
+\param factory The vertical shrinking ratio. Must be > 0.
+
+\return 0 for success or -1 if a surface is NULL or a shrinking ratio is not positive.
+*/
+int _shrinkSurfaceRGBA(SDL_Surface * src, SDL_Surface * dst, int factorx, int factory)
+{
+	int x, y, dx, dy, dgap, ra, ga, ba, aa;
+	int n_average;
+	tColorRGBA *sp, *osp, *oosp;
+	tColorRGBA *dp;
+
+	/*
+	* Reject arguments that would dereference NULL or make the divisor below zero
+	*/
+	if ((src == NULL) || (dst == NULL) || (factorx <= 0) || (factory <= 0)) {
+		return (-1);
+	}
+
+	/* Precalculate division factor (number of src pixels per box) */
+	n_average = factorx*factory;
+
+	/* Scan destination; dgap is the padding after each dst row in bytes */
+	sp = (tColorRGBA *) src->pixels;
+
+	dp = (tColorRGBA *) dst->pixels;
+	dgap = dst->pitch - dst->w * 4;
+
+	for (y = 0; y < dst->h; y++) {
+
+		osp = sp;	/* first src pixel of this row of boxes */
+		for (x = 0; x < dst->w; x++) {
+
+			/* Trace out source box and accumulate */
+			oosp = sp;	/* top-left src pixel of this box */
+			ra = ga = ba = aa = 0;
+			for (dy = 0; dy < factory; dy++) {
+				for (dx = 0; dx < factorx; dx++) {
+					ra += sp->r;
+					ga += sp->g;
+					ba += sp->b;
+					aa += sp->a;
+
+					sp++;
+				}
+				/* src dx loop done: step to the same box column on the next src row */
+				sp = (tColorRGBA *)((Uint8*)sp + (src->pitch - 4*factorx));
+			}
+			/* src dy loop */
+
+			/* next box-x */
+			sp = (tColorRGBA *)((Uint8*)oosp + 4*factorx);
+
+			/* Store result in destination (integer mean, truncated) */
+			dp->r = (Uint8)(ra/n_average);
+			dp->g = (Uint8)(ga/n_average);
+			dp->b = (Uint8)(ba/n_average);
+			dp->a = (Uint8)(aa/n_average);
+
+			/*
+			* Advance destination pointer
+			*/
+			dp++;
+		}
+		/* dst x loop */
+
+		/* next box-y */
+		sp = (tColorRGBA *)((Uint8*)osp + src->pitch*factory);
+
+		/*
+		* Advance destination pointers (skip the dst row padding)
+		*/
+		dp = (tColorRGBA *) ((Uint8 *) dp + dgap);
+	}
+	/* dst y loop */
+
+	return (0);
+}
+
+/*!
+\brief Internal 8 bit integer-factor averaging shrinker.
+
+Shrinks 8bit Y 'src' surface to 'dst' surface.
+Each dst pixel is the integer mean of a factorx by factory box of src bytes.
+Assumes src and dst surfaces are of 8 bit depth.
+Assumes dst surface was allocated with the correct dimensions (every box must lie inside src).
+
+\param src The surface to shrink (input).
+\param dst The shrunken surface (output).
+\param factorx The horizontal shrinking ratio. Must be > 0.
+\param factory The vertical shrinking ratio. Must be > 0.
+
+\return 0 for success or -1 if a surface is NULL or a shrinking ratio is not positive.
+*/
+int _shrinkSurfaceY(SDL_Surface * src, SDL_Surface * dst, int factorx, int factory)
+{
+	int x, y, dx, dy, dgap, a;
+	int n_average;
+	Uint8 *sp, *osp, *oosp;
+	Uint8 *dp;
+
+	/*
+	* Reject arguments that would dereference NULL or make the divisor below zero
+	*/
+	if ((src == NULL) || (dst == NULL) || (factorx <= 0) || (factory <= 0)) {
+		return (-1);
+	}
+
+	/* Precalculate division factor (number of src pixels per box) */
+	n_average = factorx*factory;
+
+	/* Scan destination; dgap is the padding after each dst row in bytes */
+	sp = (Uint8 *) src->pixels;
+
+	dp = (Uint8 *) dst->pixels;
+	dgap = dst->pitch - dst->w;
+
+	for (y = 0; y < dst->h; y++) {
+
+		osp = sp;	/* first src pixel of this row of boxes */
+		for (x = 0; x < dst->w; x++) {
+
+			/* Trace out source box and accumulate */
+			oosp = sp;	/* top-left src pixel of this box */
+			a = 0;
+			for (dy = 0; dy < factory; dy++) {
+				for (dx = 0; dx < factorx; dx++) {
+					a += (*sp);
+					/* next x */
+					sp++;
+				}
+				/* end src dx loop */
+				/* next y: same box column on the next src row */
+				sp = (Uint8 *)((Uint8*)sp + (src->pitch - factorx));
+			}
+			/* end src dy loop */
+
+			/* next box-x */
+			sp = (Uint8 *)((Uint8*)oosp + factorx);
+
+			/* Store result in destination (integer mean, truncated) */
+			*dp = (Uint8)(a/n_average);
+
+			/*
+			* Advance destination pointer
+			*/
+			dp++;
+		}
+		/* end dst x loop */
+
+		/* next box-y */
+		sp = (Uint8 *)((Uint8*)osp + src->pitch*factory);
+
+		/*
+		* Advance destination pointers (skip the dst row padding)
+		*/
+		dp = (Uint8 *)((Uint8 *)dp + dgap);
+	}
+	/* end dst y loop */
+
+	return (0);
+}
+
+/*! 
+\brief Internal 32 bit Zoomer with optional anti-aliasing by bilinear interpolation.
+
+Zooms 32 bit RGBA/ABGR 'src' surface to 'dst' surface.
+Assumes src and dst surfaces are of 32 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+
+\param src The surface to zoom (input).
+\param dst The zoomed surface (output).
+\param flipx Flag indicating if the image should be horizontally flipped.
+\param flipy Flag indicating if the image should be vertically flipped.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return 0 for success or -1 for error.
+*/
+int _zoomSurfaceRGBA(SDL_Surface * src, SDL_Surface * dst, int flipx, int flipy, int smooth)
+{
+	int x, y, sx, sy, ssx, ssy, *sax, *say, *csax, *csay, *salast, csx, csy, ex, ey, cx, cy, sstep, sstepx, sstepy;
+	tColorRGBA *c00, *c01, *c10, *c11;
+	tColorRGBA *sp, *csp, *dp;
+	int spixelgap, spixelw, spixelh, dgap, t1, t2;
+
+	/*
+	* Allocate memory for row/column increments 
+	*/
+	if ((sax = (int *) malloc((dst->w + 1) * sizeof(Uint32))) == NULL) {
+		return (-1);
+	}
+	if ((say = (int *) malloc((dst->h + 1) * sizeof(Uint32))) == NULL) {
+		free(sax);
+		return (-1);
+	}
+
+	/*
+	* Precalculate row increments 
+	*/
+	spixelw = (src->w - 1);
+	spixelh = (src->h - 1);
+	if (smooth) {
+		sx = (int) (65536.0 * (float) spixelw / (float) (dst->w - 1));
+		sy = (int) (65536.0 * (float) spixelh / (float) (dst->h - 1));
+	} else {
+		sx = (int) (65536.0 * (float) (src->w) / (float) (dst->w));
+		sy = (int) (65536.0 * (float) (src->h) / (float) (dst->h));
+	}
+
+	/* Maximum scaled source size */
+	ssx = (src->w << 16) - 1;
+	ssy = (src->h << 16) - 1;
+
+	/* Precalculate horizontal row increments */
+	csx = 0;
+	csax = sax;
+	for (x = 0; x <= dst->w; x++) {
+		*csax = csx;
+		csax++;
+		csx += sx;
+
+		/* Guard from overflows */
+		if (csx > ssx) { 
+			csx = ssx; 
+		}
+	}
+
+	/* Precalculate vertical row increments */
+	csy = 0;
+	csay = say;
+	for (y = 0; y <= dst->h; y++) {
+		*csay = csy;
+		csay++;
+		csy += sy;
+
+		/* Guard from overflows */
+		if (csy > ssy) {
+			csy = ssy;
+		}
+	}
+
+	sp = (tColorRGBA *) src->pixels;
+	dp = (tColorRGBA *) dst->pixels;
+	dgap = dst->pitch - dst->w * 4;
+	spixelgap = src->pitch/4;
+
+	if (flipx) sp += spixelw;
+	if (flipy) sp += (spixelgap * spixelh);
+
+	/*
+	* Switch between interpolating and non-interpolating code 
+	*/
+	if (smooth) {
+
+		/*
+		* Interpolating Zoom 
+		*/
+		csay = say;
+		for (y = 0; y < dst->h; y++) {
+			csp = sp;
+			csax = sax;
+			for (x = 0; x < dst->w; x++) {
+				/*
+				* Setup color source pointers 
+				*/
+				ex = (*csax & 0xffff);
+				ey = (*csay & 0xffff);
+				cx = (*csax >> 16);
+				cy = (*csay >> 16);
+				sstepx = cx < spixelw;
+				sstepy = cy < spixelh;
+				c00 = sp;
+				c01 = sp;
+				c10 = sp;
+				if (sstepy) {
+					if (flipy) {
+						c10 -= spixelgap;
+					} else {
+						c10 += spixelgap;
+					}
+				}
+				c11 = c10;
+				if (sstepx) {
+					if (flipx) {
+						c01--;
+						c11--;
+					} else {
+						c01++;
+						c11++;
+					}
+				}
+
+				/*
+				* Draw and interpolate colors 
+				*/
+				t1 = ((((c01->r - c00->r) * ex) >> 16) + c00->r) & 0xff;
+				t2 = ((((c11->r - c10->r) * ex) >> 16) + c10->r) & 0xff;
+				dp->r = (((t2 - t1) * ey) >> 16) + t1;
+				t1 = ((((c01->g - c00->g) * ex) >> 16) + c00->g) & 0xff;
+				t2 = ((((c11->g - c10->g) * ex) >> 16) + c10->g) & 0xff;
+				dp->g = (((t2 - t1) * ey) >> 16) + t1;
+				t1 = ((((c01->b - c00->b) * ex) >> 16) + c00->b) & 0xff;
+				t2 = ((((c11->b - c10->b) * ex) >> 16) + c10->b) & 0xff;
+				dp->b = (((t2 - t1) * ey) >> 16) + t1;
+				t1 = ((((c01->a - c00->a) * ex) >> 16) + c00->a) & 0xff;
+				t2 = ((((c11->a - c10->a) * ex) >> 16) + c10->a) & 0xff;
+				dp->a = (((t2 - t1) * ey) >> 16) + t1;				
+				/*
+				* Advance source pointer x
+				*/
+				salast = csax;
+				csax++;				
+				sstep = (*csax >> 16) - (*salast >> 16);
+				if (flipx) {
+					sp -= sstep;
+				} else {
+					sp += sstep;
+				}
+
+				/*
+				* Advance destination pointer x
+				*/
+				dp++;
+			}
+			/*
+			* Advance source pointer y
+			*/
+			salast = csay;
+			csay++;
+			sstep = (*csay >> 16) - (*salast >> 16);
+			sstep *= spixelgap;
+			if (flipy) { 
+				sp = csp - sstep;
+			} else {
+				sp = csp + sstep;
+			}
+
+			/*
+			* Advance destination pointer y
+			*/
+			dp = (tColorRGBA *) ((Uint8 *) dp + dgap);
+		}
+	} else {
+		/*
+		* Non-Interpolating Zoom 
+		*/		
+		csay = say;
+		for (y = 0; y < dst->h; y++) {
+			csp = sp;
+			csax = sax;
+			for (x = 0; x < dst->w; x++) {
+				/*
+				* Draw 
+				*/
+				*dp = *sp;
+
+				/*
+				* Advance source pointer x
+				*/
+				salast = csax;
+				csax++;				
+				sstep = (*csax >> 16) - (*salast >> 16);
+				if (flipx) sstep = -sstep;
+				sp += sstep;
+
+				/*
+				* Advance destination pointer x
+				*/
+				dp++;
+			}
+			/*
+			* Advance source pointer y
+			*/
+			salast = csay;
+			csay++;
+			sstep = (*csay >> 16) - (*salast >> 16);
+			sstep *= spixelgap;
+			if (flipy) sstep = -sstep;			
+			sp = csp + sstep;
+
+			/*
+			* Advance destination pointer y
+			*/
+			dp = (tColorRGBA *) ((Uint8 *) dp + dgap);
+		}
+	}
+
+	/*
+	* Remove temp arrays 
+	*/
+	free(sax);
+	free(say);
+
+	return (0);
+}
+
+/*! 
+
+\brief Internal 8 bit Zoomer without smoothing.
+
+Zooms 8bit palette/Y 'src' surface to 'dst' surface.
+Assumes src and dst surfaces are of 8 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+
+\param src The surface to zoom (input).
+\param dst The zoomed surface (output).
+\param flipx Flag indicating if the image should be horizontally flipped.
+\param flipy Flag indicating if the image should be vertically flipped.
+
+\return 0 for success or -1 for error.
+*/
+int _zoomSurfaceY(SDL_Surface * src, SDL_Surface * dst, int flipx, int flipy)
+{
+	int x, y;
+	int *sax, *say, *csax, *csay;	/* signed on purpose: steps are negative when flipping */
+	int csx, csy;
+	Uint8 *sp, *dp, *csp;
+	int dgap;
+
+	/*
+	* Allocate the per-column (sax) and per-row (say) source step tables.
+	*/
+	if ((sax = (int *) malloc((dst->w + 1) * sizeof(int))) == NULL) {
+		return (-1);
+	}
+	if ((say = (int *) malloc((dst->h + 1) * sizeof(int))) == NULL) {
+		free(sax);
+		return (-1);
+	}
+
+	/*
+	* Pointer setup; dgap is what remains of a dst row after dst->w one-byte pixels.
+	*/
+	sp = csp = (Uint8 *) src->pixels;
+	dp = (Uint8 *) dst->pixels;
+	dgap = dst->pitch - dst->w;
+
+	if (flipx) csp += (src->w-1);	/* start on the last column and walk left */
+	if (flipy) csp  = ( (Uint8*)csp + src->pitch*(src->h-1) );	/* start on the last row and walk up */
+
+	/*
+	* Precalculate how many source pixels to step after each destination pixel.
+	*/
+	csx = 0;
+	csax = sax;
+	for (x = 0; x < dst->w; x++) {
+		csx += src->w;
+		*csax = 0;
+		while (csx >= dst->w) {
+			csx -= dst->w;
+			(*csax)++;
+		}
+		(*csax) = (*csax) * (flipx ? -1 : 1);	/* a negative step must stay negative, hence the int table */
+		csax++;
+	}
+	csy = 0;
+	csay = say;
+	for (y = 0; y < dst->h; y++) {
+		csy += src->h;
+		*csay = 0;
+		while (csy >= dst->h) {
+			csy -= dst->h;
+			(*csay)++;
+		}
+		(*csay) = (*csay) * (flipy ? -1 : 1);	/* same sign handling for rows */
+		csay++;
+	}
+
+	/*
+	* Draw: copy one source byte per destination pixel, row by row.
+	*/
+	csay = say;
+	for (y = 0; y < dst->h; y++) {
+		csax = sax;
+		sp = csp;
+		for (x = 0; x < dst->w; x++) {
+			/*
+			* Draw
+			*/
+			*dp = *sp;
+			/*
+			* Advance source pointer by the signed column step
+			*/
+			sp += (*csax);
+			csax++;
+			/*
+			* Advance destination pointer
+			*/
+			dp++;
+		}
+		/*
+		* Advance source row start by the signed row step (in whole rows)
+		*/
+		csp += ((*csay) * src->pitch);
+		csay++;
+
+		/*
+		* Skip the remainder of the destination row
+		*/
+		dp += dgap;
+	}
+
+	/*
+	* Remove temp arrays
+	*/
+	free(sax);
+	free(say);
+
+	return (0);
+}
+
+/*! 
+\brief Internal 32 bit rotozoomer with optional anti-aliasing.
+
+Rotates and zooms 32 bit RGBA/ABGR 'src' surface to 'dst' surface based on the control 
+parameters by scanning the destination surface and optionally applying anti-aliasing
+by bilinear interpolation.
+Assumes src and dst surfaces are of 32 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+
+\param src Source surface.
+\param dst Destination surface.
+\param cx Horizontal center coordinate.
+\param cy Vertical center coordinate.
+\param isin Integer version of sine of angle.
+\param icos Integer version of cosine of angle.
+\param flipx Flag indicating horizontal mirroring should be applied.
+\param flipy Flag indicating vertical mirroring should be applied.
+\param smooth Flag indicating anti-aliasing should be used.
+*/
+void _transformSurfaceRGBA(SDL_Surface * src, SDL_Surface * dst, int cx, int cy, int isin, int icos, int flipx, int flipy, int smooth)
+{
+	int x, y, t1, t2, dx, dy, xd, yd, sdx, sdy, ax, ay, ex, ey, sw, sh;
+	tColorRGBA c00, c01, c10, c11, cswap;
+	tColorRGBA *pc, *sp;
+	int gap;
+
+	/*
+	* Variable setup: source coordinates are tracked as integers scaled by 65536 (<< 16)
+	*/
+	xd = ((src->w - dst->w) << 15);	/* half the width difference, in the scaled units */
+	yd = ((src->h - dst->h) << 15);	/* half the height difference, in the scaled units */
+	ax = (cx << 16) - (icos * cx);
+	ay = (cy << 16) - (isin * cx);
+	sw = src->w - 1;	/* last source column */
+	sh = src->h - 1;	/* last source row */
+	pc = (tColorRGBA*) dst->pixels;
+	gap = dst->pitch - dst->w * 4;	/* bytes left in a dst row after dst->w 4-byte pixels */
+
+	/*
+	* Switch between interpolating and non-interpolating code
+	*/
+	if (smooth) {
+		for (y = 0; y < dst->h; y++) {
+			dy = cy - y;
+			sdx = (ax + (isin * dy)) + xd;	/* scaled source x for the first pixel of this row */
+			sdy = (ay - (icos * dy)) + yd;	/* scaled source y for the first pixel of this row */
+			for (x = 0; x < dst->w; x++) {
+				dx = (sdx >> 16);	/* whole-pixel part of the source position */
+				dy = (sdy >> 16);
+				if (flipx) dx = sw - dx;
+				if (flipy) dy = sh - dy;
+				if ((dx > -1) && (dy > -1) && (dx < (src->w-1)) && (dy < (src->h-1))) {	/* (dx+1, dy+1) is read below, so the last column/row is excluded */
+					sp = (tColorRGBA *)src->pixels;;
+					sp += ((src->pitch/4) * dy);
+					sp += dx;
+					c00 = *sp;	/* pixel at (dx, dy) */
+					sp += 1;
+					c01 = *sp;	/* pixel at (dx+1, dy) */
+					sp += (src->pitch/4);
+					c11 = *sp;	/* pixel at (dx+1, dy+1) */
+					sp -= 1;
+					c10 = *sp;	/* pixel at (dx, dy+1) */
+					if (flipx) {
+						cswap = c00; c00=c01; c01=cswap;
+						cswap = c10; c10=c11; c11=cswap;
+					}
+					if (flipy) {
+						cswap = c00; c00=c10; c10=cswap;
+						cswap = c01; c01=c11; c11=cswap;
+					}
+					/*
+					* Interpolate colors: blend along x with ex, then along y with ey (low 16 bits of the position)
+					*/
+					ex = (sdx & 0xffff);
+					ey = (sdy & 0xffff);
+					t1 = ((((c01.r - c00.r) * ex) >> 16) + c00.r) & 0xff;
+					t2 = ((((c11.r - c10.r) * ex) >> 16) + c10.r) & 0xff;
+					pc->r = (((t2 - t1) * ey) >> 16) + t1;
+					t1 = ((((c01.g - c00.g) * ex) >> 16) + c00.g) & 0xff;
+					t2 = ((((c11.g - c10.g) * ex) >> 16) + c10.g) & 0xff;
+					pc->g = (((t2 - t1) * ey) >> 16) + t1;
+					t1 = ((((c01.b - c00.b) * ex) >> 16) + c00.b) & 0xff;
+					t2 = ((((c11.b - c10.b) * ex) >> 16) + c10.b) & 0xff;
+					pc->b = (((t2 - t1) * ey) >> 16) + t1;
+					t1 = ((((c01.a - c00.a) * ex) >> 16) + c00.a) & 0xff;
+					t2 = ((((c11.a - c10.a) * ex) >> 16) + c10.a) & 0xff;
+					pc->a = (((t2 - t1) * ey) >> 16) + t1;
+				}	/* positions outside that range leave the destination pixel untouched */
+				sdx += icos;
+				sdy += isin;
+				pc++;
+			}
+			pc = (tColorRGBA *) ((Uint8 *) pc + gap);
+		}
+	} else {
+		for (y = 0; y < dst->h; y++) {
+			dy = cy - y;
+			sdx = (ax + (isin * dy)) + xd;
+			sdy = (ay - (icos * dy)) + yd;
+			for (x = 0; x < dst->w; x++) {
+				dx = (short) (sdx >> 16);	/* note: the whole-pixel part is narrowed to short here */
+				dy = (short) (sdy >> 16);
+				if (flipx) dx = (src->w-1)-dx;
+				if (flipy) dy = (src->h-1)-dy;
+				if ((dx >= 0) && (dy >= 0) && (dx < src->w) && (dy < src->h)) {
+					sp = (tColorRGBA *) ((Uint8 *) src->pixels + src->pitch * dy);
+					sp += dx;
+					*pc = *sp;	/* plain copy of the addressed source pixel */
+				}
+				sdx += icos;
+				sdy += isin;
+				pc++;
+			}
+			pc = (tColorRGBA *) ((Uint8 *) pc + gap);
+		}
+	}
+}
+
+/*!
+
+\brief Rotates and zooms 8 bit palette/Y 'src' surface to 'dst' surface without smoothing.
+
+Rotates and zooms 8 bit palette/Y 'src' surface to 'dst' surface based on the control 
+parameters by scanning the destination surface.
+Assumes src and dst surfaces are of 8 bit depth.
+Assumes dst surface was allocated with the correct dimensions.
+
+\param src Source surface.
+\param dst Destination surface.
+\param cx Horizontal center coordinate.
+\param cy Vertical center coordinate.
+\param isin Integer version of sine of angle.
+\param icos Integer version of cosine of angle.
+\param flipx Flag indicating horizontal mirroring should be applied.
+\param flipy Flag indicating vertical mirroring should be applied.
+*/
+void transformSurfaceY(SDL_Surface * src, SDL_Surface * dst, int cx, int cy, int isin, int icos, int flipx, int flipy)
+{
+	int x, y, dx, dy, xd, yd, sdx, sdy, ax, ay, sw, sh;
+	tColorY *pc, *sp;
+	int gap;
+
+	/*
+	* Variable setup: source coordinates are tracked as integers scaled by 65536 (<< 16)
+	*/
+	xd = ((src->w - dst->w) << 15);	/* half the width difference, in the scaled units */
+	yd = ((src->h - dst->h) << 15);	/* half the height difference, in the scaled units */
+	ax = (cx << 16) - (icos * cx);
+	ay = (cy << 16) - (isin * cx);
+	sw = src->w - 1;	/* assigned but not read anywhere in this function */
+	sh = src->h - 1;	/* assigned but not read anywhere in this function */
+	pc = (tColorY*) dst->pixels;
+	gap = dst->pitch - dst->w;	/* NOTE(review): added to pc in tColorY units; assumes tColorY is one byte -- confirm */
+	/*
+	* Clear surface to colorkey: every byte of dst is set to the low 8 bits of the source colorkey
+	*/ 	
+	memset(pc, (int)(_colorkey(src) & 0xff), dst->pitch * dst->h);
+	/*
+	* Iterate through destination surface, copying the source pixel each position maps to
+	*/
+	for (y = 0; y < dst->h; y++) {
+		dy = cy - y;
+		sdx = (ax + (isin * dy)) + xd;	/* scaled source x for the first pixel of this row */
+		sdy = (ay - (icos * dy)) + yd;	/* scaled source y for the first pixel of this row */
+		for (x = 0; x < dst->w; x++) {
+			dx = (short) (sdx >> 16);	/* whole-pixel part, narrowed to short */
+			dy = (short) (sdy >> 16);
+			if (flipx) dx = (src->w-1)-dx;
+			if (flipy) dy = (src->h-1)-dy;
+			if ((dx >= 0) && (dy >= 0) && (dx < src->w) && (dy < src->h)) {
+				sp = (tColorY *) (src->pixels);
+				sp += (src->pitch * dy + dx);
+				*pc = *sp;
+			}	/* positions outside the source keep the colorkey fill */
+			sdx += icos;
+			sdy += isin;
+			pc++;
+		}
+		pc += gap;
+	}
+}
+
+/*!
+\brief Rotates a 32 bit surface in increments of 90 degrees.
+
+Specialized 90 degree rotator which rotates a 'src' surface in 90 degree 
+increments clockwise returning a new surface. Faster than rotozoomer since
+no scanning or interpolation takes place. Input surface must be 32 bit.
+(code contributed by J. Schiller, improved by C. Allport and A. Schiffler)
+
+\param src Source surface to rotate.
+\param numClockwiseTurns Number of clockwise 90 degree turns to apply to the source.
+
+\returns The new, rotated surface; or NULL for surfaces with incorrect input format.
+*/
+SDL_Surface* rotateSurface90Degrees(SDL_Surface* src, int numClockwiseTurns) 
+{
+	int row, col, newWidth, newHeight;
+	int bpp, src_ipr, dst_ipr;
+	SDL_Surface* dst;
+	Uint32* srcBuf;
+	Uint32* dstBuf;
+
+	/* Has to be a valid surface pointer and only 32-bit surfaces (for now) */
+	if (!src || src->format->BitsPerPixel != 32) { return NULL; }
+
+	/* normalize numClockwiseTurns into the range 0..3 */
+	while(numClockwiseTurns < 0) { numClockwiseTurns += 4; }
+	numClockwiseTurns = (numClockwiseTurns % 4);
+
+	/* if it's even, our new width will be the same as the source surface */
+	newWidth = (numClockwiseTurns % 2) ? (src->h) : (src->w);
+	newHeight = (numClockwiseTurns % 2) ? (src->w) : (src->h);
+	dst = SDL_CreateRGBSurface( src->flags, newWidth, newHeight, src->format->BitsPerPixel,
+		src->format->Rmask,
+		src->format->Gmask, 
+		src->format->Bmask, 
+		src->format->Amask);
+	if(!dst) {
+		return NULL;
+	}
+
+	/* Lock both surfaces: src is read and dst is written below, and both are unlocked on exit */
+	if (SDL_MUSTLOCK(src)) {
+		SDL_LockSurface(src);
+	}
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_LockSurface(dst);
+	}
+
+	/* Calculate int-per-row (pitch expressed in pixels rather than bytes) */
+	bpp = src->format->BitsPerPixel / 8;
+	src_ipr = src->pitch / bpp;
+	dst_ipr = dst->pitch / bpp;
+
+	switch(numClockwiseTurns) {
+	case 0: /* Make a copy of the surface */
+		{
+			/* Unfortunately SDL_BlitSurface cannot be used to make a copy of the surface
+			since it does not preserve alpha. */
+
+			if (src->pitch == dst->pitch) {
+				/* If the pitch is the same for both surfaces, the memory can be copied all at once. */
+				memcpy(dst->pixels, src->pixels, (src->h * src->pitch));
+			}
+			else
+			{
+				/* If the pitch differs, copy each row separately */
+				srcBuf = (Uint32*)(src->pixels); 
+				dstBuf = (Uint32*)(dst->pixels);
+				for (row = 0; row < src->h; row++) {
+					memcpy(dstBuf, srcBuf, dst->w * bpp);
+					srcBuf += src_ipr;
+					dstBuf += dst_ipr;
+				} /* end for(row) */
+			} /* end else (pitch differs) */
+		}
+		break;
+
+		/* rotate clockwise */
+	case 1: /* rotated 90 degrees clockwise */
+		{
+			/* source row 'row' becomes destination column (dst->w - row - 1), filled top to bottom */
+			for (row = 0; row < src->h; ++row) {
+				srcBuf = (Uint32*)(src->pixels) + (row * src_ipr);
+				dstBuf = (Uint32*)(dst->pixels) + (dst->w - row - 1);
+				for (col = 0; col < src->w; ++col) {
+					*dstBuf = *srcBuf;
+					++srcBuf;
+					dstBuf += dst_ipr;
+				} 
+				/* end for(col) */
+			} 
+			/* end for(row) */
+		}
+		break;
+
+	case 2: /* rotated 180 degrees clockwise */
+		{
+			/* source row 'row' becomes destination row (dst->h - row - 1), filled right to left */
+			for (row = 0; row < src->h; ++row) {
+				srcBuf = (Uint32*)(src->pixels) + (row * src_ipr);
+				dstBuf = (Uint32*)(dst->pixels) + ((dst->h - row - 1) * dst_ipr) + (dst->w - 1);
+				for (col = 0; col < src->w; ++col) {
+					*dstBuf = *srcBuf;
+					++srcBuf;
+					--dstBuf;
+				} 
+			} 
+		}
+		break;
+
+	case 3: /* three clockwise turns: source row 'row' becomes destination column 'row', filled bottom to top */
+		{
+			for (row = 0; row < src->h; ++row) {
+				srcBuf = (Uint32*)(src->pixels) + (row * src_ipr);
+				dstBuf = (Uint32*)(dst->pixels) + row + ((dst->h - 1) * dst_ipr);
+				for (col = 0; col < src->w; ++col) {
+					*dstBuf = *srcBuf;
+					++srcBuf;
+					dstBuf -= dst_ipr;
+				} 
+			} 
+		}
+		break;
+	} 
+	/* end switch */
+
+	if (SDL_MUSTLOCK(src)) {
+		SDL_UnlockSurface(src);
+	}
+	if (SDL_MUSTLOCK(dst)) {
+		SDL_UnlockSurface(dst);
+	}
+
+	return dst;
+}
+
+
+/*!
+\brief Internal target surface sizing function for rotozooms with trig result return. 
+
+\param width The source surface width.
+\param height The source surface height.
+\param angle The angle to rotate in degrees.
+\param zoomx The horizontal scaling factor.
+\param zoomy The vertical scaling factor.
+\param dstwidth The calculated width of the destination surface.
+\param dstheight The calculated height of the destination surface.
+\param canglezoom The cosine of the angle adjusted by the zoom factor.
+\param sanglezoom The sine of the angle adjusted by the zoom factor.
+
+*/
+void _rotozoomSurfaceSizeTrig(int width, int height, double angle, double zoomx, double zoomy, 
+	int *dstwidth, int *dstheight, 
+	double *canglezoom, double *sanglezoom)
+{
+	double x, y, cx, cy, sx, sy;
+	double radangle;
+	int dstwidthhalf, dstheighthalf;
+
+	/*
+	* Determine destination width and height by rotating a centered source box;
+	* the zoom-scaled sine and cosine are also handed back through sanglezoom/canglezoom.
+	*/
+	radangle = angle * (M_PI / 180.0);	/* degrees to radians */
+	*sanglezoom = sin(radangle);
+	*canglezoom = cos(radangle);
+	*sanglezoom *= zoomx;
+	*canglezoom *= zoomx;	/* NOTE(review): both outputs are scaled by zoomx; zoomy is never read in this function -- confirm intended */
+	x = (double)(width / 2);	/* integer division happens before the conversion to double */
+	y = (double)(height / 2);
+	cx = *canglezoom * x;
+	cy = *canglezoom * y;
+	sx = *sanglezoom * x;
+	sy = *sanglezoom * y;
+
+	dstwidthhalf = MAX((int)
+		ceil(MAX(MAX(MAX(fabs(cx + sy), fabs(cx - sy)), fabs(-cx + sy)), fabs(-cx - sy))), 1);
+	dstheighthalf = MAX((int)
+		ceil(MAX(MAX(MAX(fabs(sx + cy), fabs(sx - cy)), fabs(-sx + cy)), fabs(-sx - cy))), 1);
+	*dstwidth = 2 * dstwidthhalf;	/* twice the half extent, so always even */
+	*dstheight = 2 * dstheighthalf;
+}
+
+/*! 
+\brief Returns the size of the resulting target surface for a rotozoomSurfaceXY() call. 
+
+\param width The source surface width.
+\param height The source surface height.
+\param angle The angle to rotate in degrees.
+\param zoomx The horizontal scaling factor.
+\param zoomy The vertical scaling factor.
+\param dstwidth The calculated width of the rotozoomed destination surface.
+\param dstheight The calculated height of the rotozoomed destination surface.
+*/
+void rotozoomSurfaceSizeXY(int width, int height, double angle, double zoomx, double zoomy, int *dstwidth, int *dstheight)
+{
+	/* Only the two size outputs are wanted; the trig outputs land in a scratch pair and are dropped. */
+	double unusedTrig[2];
+	_rotozoomSurfaceSizeTrig(width, height, angle, zoomx, zoomy, dstwidth, dstheight, &unusedTrig[0], &unusedTrig[1]);
+}
+
+/*! 
+\brief Returns the size of the resulting target surface for a rotozoomSurface() call. 
+
+\param width The source surface width.
+\param height The source surface height.
+\param angle The angle to rotate in degrees.
+\param zoom The scaling factor.
+\param dstwidth The calculated width of the rotozoomed destination surface.
+\param dstheight The calculated height of the rotozoomed destination surface.
+*/
+void rotozoomSurfaceSize(int width, int height, double angle, double zoom, int *dstwidth, int *dstheight)
+{
+	/* Same factor on both axes; the trig outputs land in a scratch pair and are dropped. */
+	double unusedTrig[2];
+	_rotozoomSurfaceSizeTrig(width, height, angle, zoom, zoom, dstwidth, dstheight, &unusedTrig[0], &unusedTrig[1]);
+}
+
+/*!
+\brief Rotates and zooms a surface with optional anti-aliasing. 
+
+Rotates and zooms a 32bit or 8bit 'src' surface to newly created 'dst' surface.
+'angle' is the rotation in degrees and 'zoom' a scaling factor. If 'smooth' is set
+then the destination 32bit surface is anti-aliased. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+
+\param src The surface to rotozoom.
+\param angle The angle to rotate in degrees.
+\param zoom The scaling factor.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return The new rotozoomed surface.
+*/
+SDL_Surface *rotozoomSurface(SDL_Surface * src, double angle, double zoom, int smooth)
+{
+	return rotozoomSurfaceXY(src, angle, zoom, zoom, smooth);	/* uniform zoom: the same factor is passed for both axes */
+}
+
+/*!
+\brief Rotates and zooms a surface with different horizontal and vertical scaling factors and optional anti-aliasing. 
+
+Rotates and zooms a 32bit or 8bit 'src' surface to newly created 'dst' surface.
+'angle' is the rotation in degrees, 'zoomx' and 'zoomy' scaling factors. If 'smooth' is set
+then the destination 32bit surface is anti-aliased. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+
+\param src The surface to rotozoom.
+\param angle The angle to rotate in degrees.
+\param zoomx The horizontal scaling factor.
+\param zoomy The vertical scaling factor.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return The new rotozoomed surface.
+*/
+SDL_Surface *rotozoomSurfaceXY(SDL_Surface * src, double angle, double zoomx, double zoomy, int smooth)
+{
+	SDL_Surface *rz_src;
+	SDL_Surface *rz_dst;
+	double zoominv;
+	double sanglezoom, canglezoom, sanglezoominv, canglezoominv;
+	int dstwidthhalf, dstwidth, dstheighthalf, dstheight;
+	int is32bit;
+	int i, src_converted;
+	int flipx,flipy;
+	Uint8 r,g,b;
+	Uint32 colorkey = 0;
+	int colorKeyAvailable = 0;
+
+	/*
+	* Sanity check
+	*/
+	if (src == NULL)
+		return (NULL);
+
+	if (src->flags & SDL_SRCCOLORKEY)
+	{
+		/* Remember the colorkey as RGB so it can be re-mapped into the target format later */
+		colorkey = _colorkey(src);
+		SDL_GetRGB(colorkey, src->format, &r, &g, &b);
+		colorKeyAvailable = 1;
+	}
+	/*
+	* Determine if source surface is 32bit or 8bit
+	*/
+	is32bit = (src->format->BitsPerPixel == 32);
+	if ((is32bit) || (src->format->BitsPerPixel == 8)) {
+		/*
+		* Use source surface 'as is'
+		*/
+		rz_src = src;
+		src_converted = 0;
+	} else {
+		/* New source surface is 32bit with a defined RGBA ordering */
+		rz_src =
+			SDL_CreateRGBSurface(SDL_SWSURFACE, src->w, src->h, 32, 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+			0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000
+#else
+			0xff000000,  0x00ff0000, 0x0000ff00, 0x000000ff
+#endif
+			);
+		if (rz_src == NULL)	/* allocation failed: nothing to blit into */
+			return NULL;
+		if(colorKeyAvailable)
+			SDL_SetColorKey(src, 0, 0);	/* colorkey is switched off on src for the duration of the blit */
+
+		SDL_BlitSurface(src, NULL, rz_src, NULL);
+
+		if(colorKeyAvailable)
+			SDL_SetColorKey(src, SDL_SRCCOLORKEY, colorkey);	/* restore the caller's colorkey */
+		src_converted = 1;
+		is32bit = 1;
+	}
+
+	/*
+	* Sanity check zoom factor: negative factors request a flip, tiny factors are clamped
+	*/
+	flipx = (zoomx<0.0);
+	if (flipx) zoomx=-zoomx;
+	flipy = (zoomy<0.0);
+	if (flipy) zoomy=-zoomy;
+	if (zoomx < VALUE_LIMIT) zoomx = VALUE_LIMIT;
+	if (zoomy < VALUE_LIMIT) zoomy = VALUE_LIMIT;
+	zoominv = 65536.0 / (zoomx * zoomx);	/* NOTE(review): built from zoomx only; zoomy does not enter this factor -- confirm intended */
+
+	/*
+	* Check if we have a rotozoom or just a zoom
+	*/
+	if (fabs(angle) > VALUE_LIMIT) {
+
+		/*
+		* Angle!=0: full rotozoom
+		*/
+		/*
+		* -----------------------
+		*/
+
+		/* Determine target size */
+		_rotozoomSurfaceSizeTrig(rz_src->w, rz_src->h, angle, zoomx, zoomy, &dstwidth, &dstheight, &canglezoom, &sanglezoom);
+
+		/*
+		* Calculate target factors from sin/cos and zoom
+		*/
+		sanglezoominv = sanglezoom;
+		canglezoominv = canglezoom;
+		sanglezoominv *= zoominv;
+		canglezoominv *= zoominv;
+
+		/* Calculate half size */
+		dstwidthhalf = dstwidth / 2;
+		dstheighthalf = dstheight / 2;
+
+		/* Alloc space to completely contain the rotated surface */
+		rz_dst = NULL;
+		if (is32bit) {
+			/*
+			* Target surface is 32bit with source RGBA/ABGR ordering
+			*/
+			rz_dst =
+				SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 32,
+				rz_src->format->Rmask, rz_src->format->Gmask,
+				rz_src->format->Bmask, rz_src->format->Amask);
+		} else {
+			/*
+			* Target surface is 8bit
+			*/
+			rz_dst = SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 8, 0, 0, 0, 0);
+		}
+
+		/* Check target; the temporary converted source must not outlive a failure */
+		if (rz_dst == NULL) {
+			if (src_converted) SDL_FreeSurface(rz_src);
+			return NULL;
+		}
+
+		/* Adjust for guard rows */
+		rz_dst->h = dstheight;
+
+		if (colorKeyAvailable == 1){
+			colorkey = SDL_MapRGB(rz_dst->format, r, g, b);
+
+			SDL_FillRect(rz_dst, NULL, colorkey );
+		}
+
+		/*
+		* Lock source surface
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_LockSurface(rz_src);
+		}
+
+		/*
+		* Check which kind of surface we have
+		*/
+		if (is32bit) {
+			/*
+			* Call the 32bit transformation routine to do the rotation (using alpha)
+			*/
+			_transformSurfaceRGBA(rz_src, rz_dst, dstwidthhalf, dstheighthalf,
+				(int) (sanglezoominv), (int) (canglezoominv), 
+				flipx, flipy,
+				smooth);
+			/*
+			* Turn on source-alpha support
+			*/
+			SDL_SetAlpha(rz_dst, SDL_SRCALPHA, 255);
+			SDL_SetColorKey(rz_dst, SDL_SRCCOLORKEY | SDL_RLEACCEL, _colorkey(rz_src));
+		} else {
+			/*
+			* Copy palette and colorkey info
+			*/
+			for (i = 0; i < rz_src->format->palette->ncolors; i++) {
+				rz_dst->format->palette->colors[i] = rz_src->format->palette->colors[i];
+			}
+			rz_dst->format->palette->ncolors = rz_src->format->palette->ncolors;
+			/*
+			* Call the 8bit transformation routine to do the rotation
+			*/
+			transformSurfaceY(rz_src, rz_dst, dstwidthhalf, dstheighthalf,
+				(int) (sanglezoominv), (int) (canglezoominv),
+				flipx, flipy);
+			SDL_SetColorKey(rz_dst, SDL_SRCCOLORKEY | SDL_RLEACCEL, _colorkey(rz_src));
+		}
+		/*
+		* Unlock source surface
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_UnlockSurface(rz_src);
+		}
+
+	} else {
+
+		/*
+		* Angle=0: Just a zoom
+		*/
+		/*
+		* --------------------
+		*/
+
+		/*
+		* Calculate target size
+		*/
+		zoomSurfaceSize(rz_src->w, rz_src->h, zoomx, zoomy, &dstwidth, &dstheight);
+
+		/* Alloc space to completely contain the zoomed surface */
+		rz_dst = NULL;
+		if (is32bit) {
+			/*
+			* Target surface is 32bit with source RGBA/ABGR ordering
+			*/
+			rz_dst =
+				SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 32,
+				rz_src->format->Rmask, rz_src->format->Gmask,
+				rz_src->format->Bmask, rz_src->format->Amask);
+		} else {
+			/*
+			* Target surface is 8bit
+			*/
+			rz_dst = SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 8, 0, 0, 0, 0);
+		}
+
+		/* Check target; the temporary converted source must not outlive a failure */
+		if (rz_dst == NULL) {
+			if (src_converted) SDL_FreeSurface(rz_src);
+			return NULL;
+		}
+
+		/* Adjust for guard rows */
+		rz_dst->h = dstheight;
+
+		if (colorKeyAvailable == 1){
+			colorkey = SDL_MapRGB(rz_dst->format, r, g, b);
+
+			SDL_FillRect(rz_dst, NULL, colorkey );
+		}
+
+		/*
+		* Lock source surface
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_LockSurface(rz_src);
+		}
+
+		/*
+		* Check which kind of surface we have
+		*/
+		if (is32bit) {
+			/*
+			* Call the 32bit transformation routine to do the zooming (using alpha)
+			*/
+			_zoomSurfaceRGBA(rz_src, rz_dst, flipx, flipy, smooth);
+
+			/*
+			* Turn on source-alpha support
+			*/
+			SDL_SetAlpha(rz_dst, SDL_SRCALPHA, 255);
+			SDL_SetColorKey(rz_dst, SDL_SRCCOLORKEY | SDL_RLEACCEL, _colorkey(rz_src));
+		} else {
+			/*
+			* Copy palette and colorkey info
+			*/
+			for (i = 0; i < rz_src->format->palette->ncolors; i++) {
+				rz_dst->format->palette->colors[i] = rz_src->format->palette->colors[i];
+			}
+			rz_dst->format->palette->ncolors = rz_src->format->palette->ncolors;
+
+			/*
+			* Call the 8bit transformation routine to do the zooming
+			*/
+			_zoomSurfaceY(rz_src, rz_dst, flipx, flipy);
+			SDL_SetColorKey(rz_dst, SDL_SRCCOLORKEY | SDL_RLEACCEL, _colorkey(rz_src));
+		}
+
+		/*
+		* Unlock source surface
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_UnlockSurface(rz_src);
+		}
+	}
+
+	/*
+	* Cleanup temp surface
+	*/
+	if (src_converted) {
+		SDL_FreeSurface(rz_src);
+	}
+
+	/*
+	* Return destination surface
+	*/
+	return (rz_dst);
+}
+
+/*!
+\brief Calculates the size of the target surface for a zoomSurface() call.
+
+The minimum size of the target surface is 1. The input factors can be positive or negative.
+
+\param width The width of the source surface to zoom.
+\param height The height of the source surface to zoom.
+\param zoomx The horizontal zoom factor.
+\param zoomy The vertical zoom factor.
+\param dstwidth Pointer to an integer to store the calculated width of the zoomed target surface.
+\param dstheight Pointer to an integer to store the calculated height of the zoomed target surface.
+*/
+void zoomSurfaceSize(int width, int height, double zoomx, double zoomy, int *dstwidth, int *dstheight)
+{
+	int w, h;
+	double fx, fy;
+
+	/*
+	* Only the magnitude of each factor matters for the size;
+	* a negative factor is simply negated.
+	*/
+	fx = (zoomx < 0.0) ? -zoomx : zoomx;
+	fy = (zoomy < 0.0) ? -zoomy : zoomy;
+
+	/*
+	* Raise factors below VALUE_LIMIT up to VALUE_LIMIT
+	*/
+	fx = (fx < VALUE_LIMIT) ? VALUE_LIMIT : fx;
+	fy = (fy < VALUE_LIMIT) ? VALUE_LIMIT : fy;
+
+	/*
+	* Scale each dimension and round via floor(value + 0.5)
+	*/
+	w = (int) floor(((double) width * fx) + 0.5);
+	h = (int) floor(((double) height * fy) + 0.5);
+
+	/*
+	* Never report a dimension below one pixel; the results are
+	* stored through the output pointers, width first.
+	*/
+	if (w < 1) w = 1;
+	if (h < 1) h = 1;
+	*dstwidth = w;
+	*dstheight = h;
+}
+
+/*! 
+\brief Zoom a surface by independent horizontal and vertical factors with optional smoothing.
+
+Zooms a 32bit or 8bit 'src' surface to newly created 'dst' surface.
+'zoomx' and 'zoomy' are scaling factors for width and height. If 'smooth' is on
+then the destination 32bit surface is anti-aliased. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+If zoom factors are negative, the image is flipped on the axes.
+
+\param src The surface to zoom.
+\param zoomx The horizontal zoom factor.
+\param zoomy The vertical zoom factor.
+\param smooth Antialiasing flag; set to SMOOTHING_ON to enable.
+
+\return The new, zoomed surface.
+*/
+SDL_Surface *zoomSurface(SDL_Surface * src, double zoomx, double zoomy, int smooth)
+{
+	SDL_Surface *rz_src;
+	SDL_Surface *rz_dst;
+	int dstwidth, dstheight;
+	int is32bit;
+	int i, src_converted;
+	int flipx, flipy;
+
+	/*
+	* Sanity check: a NULL source yields a NULL result
+	*/
+	if (src == NULL)
+		return (NULL);
+
+	/*
+	* Determine if source surface is 32bit or 8bit
+	*/
+	is32bit = (src->format->BitsPerPixel == 32);
+	if ((is32bit) || (src->format->BitsPerPixel == 8)) {
+		/*
+		* Use source surface 'as is'
+		*/
+		rz_src = src;
+		src_converted = 0;
+	} else {
+		/*
+		* Any other depth: blit into a temporary 32bit surface with a defined RGBA ordering
+		*/
+		rz_src =
+			SDL_CreateRGBSurface(SDL_SWSURFACE, src->w, src->h, 32, 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+			0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000
+#else
+			0xff000000,  0x00ff0000, 0x0000ff00, 0x000000ff
+#endif
+			);
+		if (rz_src == NULL) {
+			return NULL;
+		}
+		SDL_BlitSurface(src, NULL, rz_src, NULL);
+		src_converted = 1;	/* remembered so the temporary is freed before returning */
+		is32bit = 1;
+	}
+
+	flipx = (zoomx<0.0);	/* a negative factor requests a flip on that axis */
+	if (flipx) zoomx = -zoomx;
+	flipy = (zoomy<0.0);
+	if (flipy) zoomy = -zoomy;
+
+	/* Get size of target */
+	zoomSurfaceSize(rz_src->w, rz_src->h, zoomx, zoomy, &dstwidth, &dstheight);
+
+	/*
+	* Alloc space to completely contain the zoomed surface (GUARD_ROWS taller than dstheight)
+	*/
+	rz_dst = NULL;
+	if (is32bit) {
+		/*
+		* Target surface is 32bit with source RGBA/ABGR ordering
+		*/
+		rz_dst =
+			SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 32,
+			rz_src->format->Rmask, rz_src->format->Gmask,
+			rz_src->format->Bmask, rz_src->format->Amask);
+	} else {
+		/*
+		* Target surface is 8bit
+		*/
+		rz_dst = SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 8, 0, 0, 0, 0);
+	}
+
+	/* Check target */
+	if (rz_dst == NULL) {
+		/*
+		* Cleanup temp surface so it is not leaked on failure
+		*/
+		if (src_converted) {
+			SDL_FreeSurface(rz_src);
+		}		
+		return NULL;
+	}
+
+	/* Adjust for guard rows: the reported height goes back to dstheight */
+	rz_dst->h = dstheight;
+
+	/*
+	* Lock source surface
+	*/
+	if (SDL_MUSTLOCK(rz_src)) {
+		SDL_LockSurface(rz_src);
+	}
+
+	/*
+	* Check which kind of surface we have
+	*/
+	if (is32bit) {
+		/*
+		* Call the 32bit transformation routine to do the zooming (using alpha)
+		*/
+		_zoomSurfaceRGBA(rz_src, rz_dst, flipx, flipy, smooth);
+		/*
+		* Turn on source-alpha support
+		*/
+		SDL_SetAlpha(rz_dst, SDL_SRCALPHA, 255);
+	} else {
+		/*
+		* Copy palette and colorkey info
+		*/
+		for (i = 0; i < rz_src->format->palette->ncolors; i++) {
+			rz_dst->format->palette->colors[i] = rz_src->format->palette->colors[i];
+		}
+		rz_dst->format->palette->ncolors = rz_src->format->palette->ncolors;
+		/*
+		* Call the 8bit transformation routine to do the zooming
+		*/
+		_zoomSurfaceY(rz_src, rz_dst, flipx, flipy);
+		SDL_SetColorKey(rz_dst, SDL_SRCCOLORKEY | SDL_RLEACCEL, _colorkey(rz_src));
+	}
+	/*
+	* Unlock source surface
+	*/
+	if (SDL_MUSTLOCK(rz_src)) {
+		SDL_UnlockSurface(rz_src);
+	}
+
+	/*
+	* Cleanup temp surface
+	*/
+	if (src_converted) {
+		SDL_FreeSurface(rz_src);
+	}
+
+	/*
+	* Return destination surface
+	*/
+	return (rz_dst);
+}
+
+/*! 
+\brief Shrink a surface by an integer ratio using averaging.
+
+Shrinks a 32bit or 8bit 'src' surface to a newly created 'dst' surface.
+'factorx' and 'factory' are the shrinking ratios (i.e. 2=1/2 the size,
+3=1/3 the size, etc.) The destination surface is antialiased by averaging
+the source box RGBA or Y information. If the surface is not 8bit
+or 32bit RGBA/ABGR it will be converted into a 32bit RGBA format on the fly.
+The input surface is not modified. The output surface is newly allocated.
+
+\param src The surface to shrink.
+\param factorx The horizontal shrinking ratio.
+\param factory The vertical shrinking ratio.
+
+\return The new, shrunken surface.
+*/
+/*@null@*/ 
+SDL_Surface *shrinkSurface(SDL_Surface *src, int factorx, int factory)
+{
+	int result;
+	SDL_Surface *rz_src;
+	SDL_Surface *rz_dst = NULL;
+	int dstwidth, dstheight;
+	int is32bit;
+	int i, src_converted;
+	int haveError = 0;
+
+	/*
+	* Sanity check: the factors are used as divisors below, so they must be positive
+	*/
+	if (src == NULL || factorx <= 0 || factory <= 0) {
+		return (NULL);
+	}
+
+	/*
+	* Determine if source surface is 32bit or 8bit
+	*/
+	is32bit = (src->format->BitsPerPixel == 32);
+	if ((is32bit) || (src->format->BitsPerPixel == 8)) {
+		/*
+		* Use source surface 'as is'
+		*/
+		rz_src = src;
+		src_converted = 0;
+	} else {
+		/*
+		* New source surface is 32bit with a defined RGBA ordering
+		*/
+		rz_src = SDL_CreateRGBSurface(SDL_SWSURFACE, src->w, src->h, 32, 
+#if SDL_BYTEORDER == SDL_LIL_ENDIAN
+			0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000
+#else
+			0xff000000,  0x00ff0000, 0x0000ff00, 0x000000ff
+#endif
+			);
+		if (rz_src==NULL) {
+			haveError = 1;
+			goto exitShrinkSurface;	/* the exit path skips all rz_src handling when it is NULL */
+		}
+
+		SDL_BlitSurface(src, NULL, rz_src, NULL);
+		src_converted = 1;
+		is32bit = 1;
+	}
+
+	/*
+	* Lock the surface
+	*/
+	if (SDL_MUSTLOCK(rz_src)) {
+		if (SDL_LockSurface(rz_src) < 0) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+	}
+
+	/* Get size for target: the largest size whose factor multiple still fits in the source */
+	dstwidth=rz_src->w/factorx;
+	while (dstwidth*factorx>rz_src->w) { dstwidth--; }
+	dstheight=rz_src->h/factory;
+	while (dstheight*factory>rz_src->h) { dstheight--; }
+
+	/*
+	* Alloc space to completely contain the shrunken surface
+	* (with added guard rows)
+	*/
+	if (is32bit==1) {
+		/*
+		* Target surface is 32bit with source RGBA/ABGR ordering
+		*/
+		rz_dst =
+			SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 32,
+			rz_src->format->Rmask, rz_src->format->Gmask,
+			rz_src->format->Bmask, rz_src->format->Amask);
+	} else {
+		/*
+		* Target surface is 8bit
+		*/
+		rz_dst = SDL_CreateRGBSurface(SDL_SWSURFACE, dstwidth, dstheight + GUARD_ROWS, 8, 0, 0, 0, 0);
+	}
+
+	/* Check target */
+	if (rz_dst == NULL) {
+		haveError = 1;
+		goto exitShrinkSurface;
+	}
+
+	/* Adjust for guard rows */
+	rz_dst->h = dstheight;
+
+	/*
+	* Check which kind of surface we have
+	*/
+	if (is32bit==1) {
+		/*
+		* Call the 32bit transformation routine to do the shrinking (using alpha)
+		*/
+		result = _shrinkSurfaceRGBA(rz_src, rz_dst, factorx, factory);		
+		if ((result!=0) || (rz_dst==NULL)) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+
+		/*
+		* Turn on source-alpha support
+		*/
+		result = SDL_SetAlpha(rz_dst, SDL_SRCALPHA, 255);
+		if (result!=0) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+	} else {
+		/*
+		* Copy palette and colorkey info
+		*/
+		for (i = 0; i < rz_src->format->palette->ncolors; i++) {
+			rz_dst->format->palette->colors[i] = rz_src->format->palette->colors[i];
+		}
+		rz_dst->format->palette->ncolors = rz_src->format->palette->ncolors;
+		/*
+		* Call the 8bit transformation routine to do the shrinking
+		*/
+		result = _shrinkSurfaceY(rz_src, rz_dst, factorx, factory);
+		if (result!=0) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}
+
+		/*
+		* Set colorkey on target
+		*/
+		result = SDL_SetColorKey(rz_dst, SDL_SRCCOLORKEY | SDL_RLEACCEL, _colorkey(rz_src));
+		if (result!=0) {
+			haveError = 1;
+			goto exitShrinkSurface;
+		}		
+	}
+
+exitShrinkSurface:
+	if (rz_src!=NULL) {
+		/*
+		* Unlock source surface
+		*/
+		if (SDL_MUSTLOCK(rz_src)) {
+			SDL_UnlockSurface(rz_src);
+		}
+
+		/*
+		* Cleanup temp surface (only the converted copy, never the caller's surface)
+		*/
+		if (src_converted==1) {
+			SDL_FreeSurface(rz_src);
+		}
+	}
+
+	/* Check error state; on any failure the partially built destination is freed and NULL is returned */
+	if (haveError==1) {
+		if (rz_dst!=NULL) {
+			SDL_FreeSurface(rz_dst);
+		}
+		rz_dst=NULL;
+	} 
+
+	/*
+	* Return destination surface
+	*/
+	return (rz_dst);
+}
diff --git a/src/joystick/dummy/SDL_sysjoystick.c b/src/joystick/dummy/SDL_sysjoystick.c
deleted file mode 100644
index 3a1aae7..0000000
--- a/src/joystick/dummy/SDL_sysjoystick.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-#if defined(SDL_JOYSTICK_DUMMY) || defined(SDL_JOYSTICK_DISABLED)
-
-/* This is the system specific header for the SDL joystick API */
-
-#include "SDL_joystick.h"
-#include "../SDL_sysjoystick.h"
-#include "../SDL_joystick_c.h"
-
-/* Function to scan the system for joysticks.
- * This function should set SDL_numjoysticks to the number of available
- * joysticks.  Joystick 0 should be the system default joystick.
- * It should return 0, or -1 on an unrecoverable fatal error.
- */
-int SDL_SYS_JoystickInit(void)
-{
-	SDL_numjoysticks = 0;
-	return(0);
-}
-
-/* Function to get the device-dependent name of a joystick */
-const char *SDL_SYS_JoystickName(int index)
-{
-	SDL_SetError("Logic error: No joysticks available");
-	return(NULL);
-}
-
-/* Function to open a joystick for use.
-   The joystick to open is specified by the index field of the joystick.
-   This should fill the nbuttons and naxes fields of the joystick structure.
-   It returns 0, or -1 if there is an error.
- */
-int SDL_SYS_JoystickOpen(SDL_Joystick *joystick)
-{
-	SDL_SetError("Logic error: No joysticks available");
-	return(-1);
-}
-
-/* Function to update the state of a joystick - called as a device poll.
- * This function shouldn't update the joystick structure directly,
- * but instead should call SDL_PrivateJoystick*() to deliver events
- * and update joystick device state.
- */
-void SDL_SYS_JoystickUpdate(SDL_Joystick *joystick)
-{
-	return;
-}
-
-/* Function to close a joystick after use */
-void SDL_SYS_JoystickClose(SDL_Joystick *joystick)
-{
-	return;
-}
-
-/* Function to perform any system-specific joystick related cleanup */
-void SDL_SYS_JoystickQuit(void)
-{
-	return;
-}
-
-#endif /* SDL_JOYSTICK_DUMMY || SDL_JOYSTICK_DISABLED */
diff --git a/src/joystick/dummy/SDL_sysjoystick.o b/src/joystick/dummy/SDL_sysjoystick.o
deleted file mode 100644
index 57c1532..0000000
Binary files a/src/joystick/dummy/SDL_sysjoystick.o and /dev/null differ
diff --git a/src/loadso/dummy/SDL_sysloadso.o b/src/loadso/dummy/SDL_sysloadso.o
deleted file mode 100644
index 262035b..0000000
Binary files a/src/loadso/dummy/SDL_sysloadso.o and /dev/null differ
diff --git a/src/main/dummy/SDL_dummy_main.c b/src/main/dummy/SDL_dummy_main.c
deleted file mode 100644
index da47d06..0000000
--- a/src/main/dummy/SDL_dummy_main.c
+++ /dev/null
@@ -1,13 +0,0 @@
-
-/* Include the SDL main definition header */
-#include "SDL_main.h"
-
-#ifdef main
-#undef main
-int main(int argc, char *argv[])
-{
-	return(SDL_main(argc, argv));
-}
-#else
-/* Nothing to do on this platform */
-#endif
diff --git a/src/timer/dummy/SDL_systimer.c b/src/timer/dummy/SDL_systimer.c
deleted file mode 100644
index cc266bc..0000000
--- a/src/timer/dummy/SDL_systimer.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-#if defined(SDL_TIMER_DUMMY) || defined(SDL_TIMERS_DISABLED)
-
-#include "SDL_timer.h"
-#include "../SDL_timer_c.h"
-
-void SDL_StartTicks(void)
-{
-}
-
-Uint32 SDL_GetTicks (void)
-{
-	SDL_Unsupported();
-	return 0;
-}
-
-void SDL_Delay (Uint32 ms)
-{
-	SDL_Unsupported();
-}
-
-#include "SDL_thread.h"
-
-/* Data to handle a single periodic alarm */
-static int timer_alive = 0;
-static SDL_Thread *timer = NULL;
-
-static int RunTimer(void *unused)
-{
-	while ( timer_alive ) {
-		if ( SDL_timer_running ) {
-			SDL_ThreadedTimerCheck();
-		}
-		SDL_Delay(1);
-	}
-	return(0);
-}
-
-/* This is only called if the event thread is not running */
-int SDL_SYS_TimerInit(void)
-{
-	timer_alive = 1;
-	timer = SDL_CreateThread(RunTimer, NULL);
-	if ( timer == NULL )
-		return(-1);
-	return(SDL_SetTimerThreaded(1));
-}
-
-void SDL_SYS_TimerQuit(void)
-{
-	timer_alive = 0;
-	if ( timer ) {
-		SDL_WaitThread(timer, NULL);
-		timer = NULL;
-	}
-}
-
-int SDL_SYS_StartTimer(void)
-{
-	SDL_SetError("Internal logic error: threaded timer in use");
-	return(-1);
-}
-
-void SDL_SYS_StopTimer(void)
-{
-	return;
-}
-
-#endif /* SDL_TIMER_DUMMY || SDL_TIMERS_DISABLED */
diff --git a/src/timer/dummy/SDL_systimer.o b/src/timer/dummy/SDL_systimer.o
deleted file mode 100644
index 114d47e..0000000
Binary files a/src/timer/dummy/SDL_systimer.o and /dev/null differ
diff --git a/src/timer/prizm/SDL_systimer.c b/src/timer/prizm/SDL_systimer.c
index 476d8d9..4a6e67e 100644
--- a/src/timer/prizm/SDL_systimer.c
+++ b/src/timer/prizm/SDL_systimer.c
@@ -22,7 +22,7 @@
 
 #include <gint/gint.h>
 #include <gint/timer.h>
-#include <gint/rtc.h>
+#include <gint/mpu/tmu.h>
 
 #include "SDL_config.h"
 
@@ -30,32 +30,56 @@
 
 #include "SDL_timer.h"
 #include "../SDL_timer_c.h"
+
 
-static volatile unsigned *value;
-static volatile unsigned *control;
-Uint32 tick_sum = 0;
-Uint32 start = 0;
-
-/*
-Uses the first timer (0x900C0000)
-TC: http://hackspire.unsads.com/wiki/index.php/Memory-mapped_I/O_ports#900C0000_-_First_timer
-CX: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0271d/Babehiha.html
-*/
+int timerID = -1;                /* id returned by timer_configure() in SDL_StartTicks(); -1 = no timer */
+uint32_t timerTick = 0;          /* last raw elapsed count computed by SDL_GetTicks() */
+uint32_t timerStart = 0;         /* counter value sampled by SDL_StartTicks() right after timer_start() */
+uint32_t volatile *Tcnt = NULL;  /* TCNT register of the configured timer; NULL until SDL_StartTicks() succeeds */
+
+static int callback(void) /* passed to timer_configure() by SDL_StartTicks() */
+{
+	return TIMER_CONTINUE; /* no per-event work; presumably asks gint to keep the timer running — confirm */
+}
+
 
 void SDL_StartTicks(void)
 {
-
+    timerID = timer_configure( TIMER_ETMU, 0xFFFFFFFF, GINT_CALL(callback)); /* NOTE(review): confirm the unit of the 0xFFFFFFFF delay in gint; ticks presumably restart once the counter reloads */
+    if (timerID!=-1) /* otherwise every timer global keeps its initial value */
+    {
+        timer_start(timerID);
+        Tcnt = &SH7305_ETMU[timerID-3].TCNT; /* assumes ETMU ids start at 3 — TODO confirm against gint */
+        timerStart = *Tcnt; /* reference count that SDL_GetTicks() subtracts from */
+    }
 }
 
 Uint32 SDL_GetTicks (void)
 {
-
+    /* Milliseconds since SDL_StartTicks(); 0xFFFFFFFF when no timer was configured. */
+    if (timerID!=-1)
+    {
+        timerTick = timerStart - *Tcnt; /* start minus current: elapsed raw count of a down-counter */
+        return (Uint32)(((uint64_t)timerTick * 1000) >> 15); /* 32768 Hz ticks -> ms; ">> 5" ran about 2.4% fast */
+    }
+    else return 0xFFFFFFFF;
 }
 
 void SDL_Delay (Uint32 ms)
-{
+{
+    if (timerID!=-1) /* without a timer the call returns immediately */
+    {
+        uint32_t timerTempStart = SDL_GetTicks();
+        uint32_t currentTCNT; /* elapsed time in SDL_GetTicks() units, not a raw TCNT value */
 
+        do /* busy-wait: polls SDL_GetTicks() with no sleep in between */
+        {
+            currentTCNT = SDL_GetTicks() - timerTempStart;
+        }
+        while(currentTCNT <= ms); /* "<=" loops until the difference exceeds ms, so the wait is never short */
+    }
 }
+
 
 #include "SDL_thread.h"
 
diff --git a/src/video/SDL_bmp.c b/src/video/SDL_bmp.c
index d56cfd8..6f6c1a6 100644
--- a/src/video/SDL_bmp.c
+++ b/src/video/SDL_bmp.c
@@ -21,14 +21,14 @@
 */
 #include "SDL_config.h"
 
-/* 
+/*
    Code to load and save surfaces in Windows BMP format.
 
    Why support BMP format?  Well, it's a native format for Windows, and
    most image processing programs can read and write it.  It would be nice
    to be able to have at least one image format that we can natively load
    and save, and since PNG is so complex that it would bloat the library,
-   BMP is a good alternative. 
+   BMP is a good alternative.
 
    This code currently supports Win32 DIBs in uncompressed 8 and 24 bpp.
 */
@@ -44,6 +44,61 @@
 #define BI_BITFIELDS	3
 #endif
 
+/*
+ * Load a 24 bpp uncompressed BMP into a new 16 bpp surface (RGB565 pixels,
+ * file row i stored on surface row height-1-i).  Returns NULL when opening,
+ * reading, the format check (24 bpp, positive size) or an allocation fails.
+ */
+SDL_Surface * SDL_LoadBMP( const char* filename )
+{
+    SDL_Surface *bmp = NULL;
+    unsigned char info[54], *data = NULL;
+    uint32_t offsetdata;
+    int32_t width, height;
+    size_t row_padded;
+    int ok = 0;
+    FILE* f = fopen(filename, "rb");
+    if(f == NULL)
+        return NULL;
+
+    /* read the 54-byte header; require the "BM" magic and 24 bits per pixel */
+    if (fread(info, sizeof(unsigned char), 54, f) != 54 ||
+        info[0] != 'B' || info[1] != 'M' || info[28] != 24 || info[29] != 0)
+        goto done;
+    /* little-endian fields, assembled unsigned so no shift reaches the sign bit */
+    offsetdata = (uint32_t)info[10] | (uint32_t)info[11]<<8 | (uint32_t)info[12]<<16 | (uint32_t)info[13]<<24;
+    width = (int32_t)((uint32_t)info[18] | (uint32_t)info[19]<<8 | (uint32_t)info[20]<<16 | (uint32_t)info[21]<<24);
+    height = (int32_t)((uint32_t)info[22] | (uint32_t)info[23]<<8 | (uint32_t)info[24]<<16 | (uint32_t)info[25]<<24);
+    /* the upper bound on width keeps the row size below from overflowing */
+    if (width <= 0 || height <= 0 || width > 0x1FFFFFFF)
+        goto done;
+
+    row_padded = ((size_t)width*3 + 3) & ~(size_t)3;
+    data = (unsigned char*) malloc(row_padded);
+    bmp = SDL_CreateRGBSurface( 0, width, height, 16, 0, 0, 0, 0 );
+    /* seek past any extra header bytes; they may not fit in the row buffer */
+    if (data == NULL || bmp == NULL || (offsetdata > 54 && fseek(f, (long)offsetdata, SEEK_SET) != 0))
+        goto done;
+
+    for(int i = 0; i < height; i++)
+    {
+        if (fread(data, sizeof(unsigned char), row_padded, f) != row_padded)
+            goto done;
+        for(int j = 0; j < width; j++)
+        {
+            const unsigned char *px = data + 3*j; /* stored as B, G, R */
+            uint16_t color = ((px[2] & 0xf8) << 8) | ((px[1] & 0xfc) << 3) | ((px[0] & 0xf8) >> 3);
+            cSDL_SetPixel( bmp, j, (height-1)-i, color );
+        }
+    }
+    ok = 1;
+done:
+    free(data);
+    fclose(f);
+    if (!ok && bmp != NULL) { SDL_FreeSurface(bmp); bmp = NULL; }
+    return bmp;
+}
+
 
 SDL_Surface * SDL_LoadBMP_RW (SDL_RWops *src, int freesrc)
 {
@@ -240,14 +295,14 @@ SDL_Surface * SDL_LoadBMP_RW (SDL_RWops *src, int freesrc)
 				SDL_RWread(src, &palette->colors[i].g, 1, 1);
 				SDL_RWread(src, &palette->colors[i].r, 1, 1);
 				palette->colors[i].unused = 0;
-			}	
+			}
 		} else {
 			for ( i = 0; i < (int)biClrUsed; ++i ) {
 				SDL_RWread(src, &palette->colors[i].b, 1, 1);
 				SDL_RWread(src, &palette->colors[i].g, 1, 1);
 				SDL_RWread(src, &palette->colors[i].r, 1, 1);
 				SDL_RWread(src, &palette->colors[i].unused, 1, 1);
-			}	
+			}
 		}
 		palette->ncolors = biClrUsed;
 	}
diff --git a/src/video/SDL_video.c b/src/video/SDL_video.c
index 46285c9..9958387 100644
--- a/src/video/SDL_video.c
+++ b/src/video/SDL_video.c
@@ -31,6 +31,8 @@
 #include "../events/SDL_sysevents.h"
 #include "../events/SDL_events_c.h"
 
+extern VideoBootStrap PRZ_bootstrap;
+
 /* Available video drivers */
 static VideoBootStrap *bootstrap[] = {
 #if SDL_VIDEO_DRIVER_QUARTZ
@@ -128,6 +130,9 @@ static VideoBootStrap *bootstrap[] = {
 #endif
 #if SDL_VIDEO_DRIVER_DUMMY
 	&DUMMY_bootstrap,
+#endif
+#if SDL_VIDEO_DRIVER_PRIZM
+	&PRZ_bootstrap,
 #endif
 	NULL
 };
@@ -217,7 +222,7 @@ int SDL_VideoInit (const char *driver_name, Uint32 flags)
 	video->offset_x = 0;
 	video->offset_y = 0;
 	SDL_memset(&video->info, 0, (sizeof video->info));
-	
+
 	video->displayformatalphapixel = NULL;
 
 	/* Set some very sane GL defaults */
@@ -240,7 +245,7 @@ int SDL_VideoInit (const char *driver_name, Uint32 flags)
 	video->gl_config.multisamplesamples = 0;
 	video->gl_config.accelerated = -1; /* not known, don't set */
 	video->gl_config.swap_control = -1; /* not known, don't set */
-	
+
 	/* Initialize the video subsystem */
 	SDL_memset(&vformat, 0, sizeof(vformat));
 	if ( video->VideoInit(video, &vformat) < 0 ) {
@@ -397,7 +402,7 @@ int SDL_VideoModeOK (int width, int height, int bpp, Uint32 flags)
 		if ( sizes == (SDL_Rect **)0 ) {
 			/* No sizes supported at this bit-depth */
 			continue;
-		} else 
+		} else
 		if (sizes == (SDL_Rect **)NEGATIVE_ONE) {
 			/* Any size supported at this bit-depth */
 			supported = 1;
@@ -788,7 +793,7 @@ SDL_Surface * SDL_SetVideoMode (int width, int height, int bpp, Uint32 flags)
 #endif /* __SDL_NOGETPROCADDR__ */
 
 #include "SDL_glfuncs.h"
-#undef SDL_PROC	
+#undef SDL_PROC
 	}
 #endif /* SDL_VIDEO_OPENGL */
 
@@ -817,9 +822,9 @@ SDL_Surface * SDL_SetVideoMode (int width, int height, int bpp, Uint32 flags)
 		   ) {
 			video->is_32bit = 0;
 			SDL_VideoSurface = SDL_CreateRGBSurface(
-				flags, 
-				width, 
-				height,  
+				flags,
+				width,
+				height,
 				16,
 				31 << 11,
 				63 << 5,
@@ -832,10 +837,10 @@ SDL_Surface * SDL_SetVideoMode (int width, int height, int bpp, Uint32 flags)
 		{
 			video->is_32bit = 1;
 			SDL_VideoSurface = SDL_CreateRGBSurface(
-				flags, 
-				width, 
-				height, 
-				32, 
+				flags,
+				width,
+				height,
+				32,
 #if SDL_BYTEORDER == SDL_LIL_ENDIAN
 				0x000000FF,
 				0x0000FF00,
@@ -893,7 +898,7 @@ SDL_Surface * SDL_SetVideoMode (int width, int height, int bpp, Uint32 flags)
 	     (
 	     (  !(flags&SDL_ANYFORMAT) &&
 			(SDL_VideoSurface->format->BitsPerPixel != bpp)) ||
-	     (   (flags&SDL_HWPALETTE) && 
+	     (   (flags&SDL_HWPALETTE) &&
 				!(SDL_VideoSurface->flags&SDL_HWPALETTE)) ||
 		/* If the surface is in hardware, video writes are visible
 		   as soon as they are performed, so we need to buffer them
@@ -921,7 +926,7 @@ SDL_Surface * SDL_SetVideoMode (int width, int height, int bpp, Uint32 flags)
 	return(SDL_PublicSurface);
 }
 
-/* 
+/*
  * Convert a surface into the video pixel format.
  */
 SDL_Surface * SDL_DisplayFormat (SDL_Surface *surface)
@@ -935,7 +940,7 @@ SDL_Surface * SDL_DisplayFormat (SDL_Surface *surface)
 	/* Set the flags appropriate for copying to display surface */
 	if (((SDL_PublicSurface->flags&SDL_HWSURFACE) == SDL_HWSURFACE) && current_video->info.blit_hw)
 		flags = SDL_HWSURFACE;
-	else 
+	else
 		flags = SDL_SWSURFACE;
 #ifdef AUTORLE_DISPLAYFORMAT
 	flags |= (surface->flags & (SDL_SRCCOLORKEY|SDL_SRCALPHA));
@@ -1063,14 +1068,14 @@ void SDL_UpdateRects (SDL_Surface *screen, int numrects, SDL_Rect *rects)
 			SDL_LockCursor();
 			SDL_DrawCursor(SDL_ShadowSurface);
 			for ( i=0; i<numrects; ++i ) {
-				SDL_LowerBlit(SDL_ShadowSurface, &rects[i], 
+				SDL_LowerBlit(SDL_ShadowSurface, &rects[i],
 						SDL_VideoSurface, &rects[i]);
 			}
 			SDL_EraseCursor(SDL_ShadowSurface);
 			SDL_UnlockCursor();
 		} else {
 			for ( i=0; i<numrects; ++i ) {
-				SDL_LowerBlit(SDL_ShadowSurface, &rects[i], 
+				SDL_LowerBlit(SDL_ShadowSurface, &rects[i],
 						SDL_VideoSurface, &rects[i]);
 			}
 		}
@@ -1576,42 +1581,42 @@ void SDL_GL_UpdateRects(int numrects, SDL_Rect *rects)
 
 				if ( update.h > 256 )
 					update.h = 256;
-			
+
 				this->glFlush();
-				this->glTexSubImage2D( 
-					GL_TEXTURE_2D, 
-					0, 
-					0, 
-					0, 
-					update.w, 
-					update.h, 
+				this->glTexSubImage2D(
+					GL_TEXTURE_2D,
+					0,
+					0,
+					0,
+					update.w,
+					update.h,
 					this->is_32bit? GL_RGBA : GL_RGB,
 #ifdef GL_VERSION_1_2
 					this->is_32bit ? GL_UNSIGNED_BYTE : GL_UNSIGNED_SHORT_5_6_5,
 #else
 					GL_UNSIGNED_BYTE,
 #endif
-					(Uint8 *)this->screen->pixels + 
-						this->screen->format->BytesPerPixel * update.x + 
+					(Uint8 *)this->screen->pixels +
+						this->screen->format->BytesPerPixel * update.x +
 						update.y * this->screen->pitch );
-	
+
 				this->glFlush();
 				/*
 				* Note the parens around the function name:
-				* This is because some OpenGL implementations define glTexCoord etc 
+				* This is because some OpenGL implementations define glTexCoord etc
 				* as macros, and we don't want them expanded here.
 				*/
 				this->glBegin(GL_TRIANGLE_STRIP);
-					(this->glTexCoord2f)( 0.0, 0.0 );	
+					(this->glTexCoord2f)( 0.0, 0.0 );
 					(this->glVertex2i)( update.x, update.y );
-					(this->glTexCoord2f)( (float)(update.w / 256.0), 0.0 );	
+					(this->glTexCoord2f)( (float)(update.w / 256.0), 0.0 );
 					(this->glVertex2i)( update.x + update.w, update.y );
 					(this->glTexCoord2f)( 0.0, (float)(update.h / 256.0) );
 					(this->glVertex2i)( update.x, update.y + update.h );
-					(this->glTexCoord2f)( (float)(update.w / 256.0), (float)(update.h / 256.0) );	
+					(this->glTexCoord2f)( (float)(update.w / 256.0), (float)(update.h / 256.0) );
 					(this->glVertex2i)( update.x + update.w	, update.y + update.h );
-				this->glEnd();	
-			
+				this->glEnd();
+
 				tmp.x += 256;
 				tmp.w -= 256;
 			}
@@ -1641,7 +1646,7 @@ void SDL_GL_Lock()
 		this->glDisable(GL_FOG);
 		this->glDisable(GL_ALPHA_TEST);
 		this->glDisable(GL_DEPTH_TEST);
-		this->glDisable(GL_SCISSOR_TEST);	
+		this->glDisable(GL_SCISSOR_TEST);
 		this->glDisable(GL_STENCIL_TEST);
 		this->glDisable(GL_CULL_FACE);
 
diff --git a/src/video/dummy/SDL_nullevents.c b/src/video/dummy/SDL_nullevents.c
deleted file mode 100644
index 177fc3f..0000000
--- a/src/video/dummy/SDL_nullevents.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/* Being a null driver, there's no event stream. We just define stubs for
-   most of the API. */
-
-#include "SDL.h"
-#include "../../events/SDL_sysevents.h"
-#include "../../events/SDL_events_c.h"
-
-#include "SDL_nullvideo.h"
-#include "SDL_nullevents_c.h"
-
-void DUMMY_PumpEvents(_THIS)
-{
-	/* do nothing. */
-}
-
-void DUMMY_InitOSKeymap(_THIS)
-{
-	/* do nothing. */
-}
-
-/* end of SDL_nullevents.c ... */
-
diff --git a/src/video/dummy/SDL_nullevents.o b/src/video/dummy/SDL_nullevents.o
deleted file mode 100644
index d0a0db5..0000000
Binary files a/src/video/dummy/SDL_nullevents.o and /dev/null differ
diff --git a/src/video/dummy/SDL_nullevents_c.h b/src/video/dummy/SDL_nullevents_c.h
deleted file mode 100644
index 3b65794..0000000
--- a/src/video/dummy/SDL_nullevents_c.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-#include "SDL_nullvideo.h"
-
-/* Variables and functions exported by SDL_sysevents.c to other parts 
-   of the native video subsystem (SDL_sysvideo.c)
-*/
-extern void DUMMY_InitOSKeymap(_THIS);
-extern void DUMMY_PumpEvents(_THIS);
-
-/* end of SDL_nullevents_c.h ... */
-
diff --git a/src/video/dummy/SDL_nullmouse.c b/src/video/dummy/SDL_nullmouse.c
deleted file mode 100644
index 47daea8..0000000
--- a/src/video/dummy/SDL_nullmouse.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-#include "SDL_mouse.h"
-#include "../../events/SDL_events_c.h"
-
-#include "SDL_nullmouse_c.h"
-
-
-/* The implementation dependent data for the window manager cursor */
-struct WMcursor {
-	int unused;
-};
diff --git a/src/video/dummy/SDL_nullmouse.o b/src/video/dummy/SDL_nullmouse.o
deleted file mode 100644
index c1fb0ce..0000000
Binary files a/src/video/dummy/SDL_nullmouse.o and /dev/null differ
diff --git a/src/video/dummy/SDL_nullmouse_c.h b/src/video/dummy/SDL_nullmouse_c.h
deleted file mode 100644
index 479eb0e..0000000
--- a/src/video/dummy/SDL_nullmouse_c.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-#include "SDL_nullvideo.h"
-
-/* Functions to be exported */
diff --git a/src/video/dummy/SDL_nullvideo.c b/src/video/dummy/SDL_nullvideo.c
deleted file mode 100644
index 7e096e2..0000000
--- a/src/video/dummy/SDL_nullvideo.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-/* Dummy SDL video driver implementation; this is just enough to make an
- *  SDL-based application THINK it's got a working video driver, for
- *  applications that call SDL_Init(SDL_INIT_VIDEO) when they don't need it,
- *  and also for use as a collection of stubs when porting SDL to a new
- *  platform for which you haven't yet written a valid video driver.
- *
- * This is also a great way to determine bottlenecks: if you think that SDL
- *  is a performance problem for a given platform, enable this driver, and
- *  then see if your application runs faster without video overhead.
- *
- * Initial work by Ryan C. Gordon (icculus@icculus.org). A good portion
- *  of this was cut-and-pasted from Stephane Peter's work in the AAlib
- *  SDL video driver.  Renamed to "DUMMY" by Sam Lantinga.
- */
-
-#include "SDL_video.h"
-#include "SDL_mouse.h"
-#include "../SDL_sysvideo.h"
-#include "../SDL_pixels_c.h"
-#include "../../events/SDL_events_c.h"
-
-#include "SDL_nullvideo.h"
-#include "SDL_nullevents_c.h"
-#include "SDL_nullmouse_c.h"
-
-#define DUMMYVID_DRIVER_NAME "dummy"
-
-/* Initialization/Query functions */
-static int DUMMY_VideoInit(_THIS, SDL_PixelFormat *vformat);
-static SDL_Rect **DUMMY_ListModes(_THIS, SDL_PixelFormat *format, Uint32 flags);
-static SDL_Surface *DUMMY_SetVideoMode(_THIS, SDL_Surface *current, int width, int height, int bpp, Uint32 flags);
-static int DUMMY_SetColors(_THIS, int firstcolor, int ncolors, SDL_Color *colors);
-static void DUMMY_VideoQuit(_THIS);
-
-/* Hardware surface functions */
-static int DUMMY_AllocHWSurface(_THIS, SDL_Surface *surface);
-static int DUMMY_LockHWSurface(_THIS, SDL_Surface *surface);
-static void DUMMY_UnlockHWSurface(_THIS, SDL_Surface *surface);
-static void DUMMY_FreeHWSurface(_THIS, SDL_Surface *surface);
-
-/* etc. */
-static void DUMMY_UpdateRects(_THIS, int numrects, SDL_Rect *rects);
-
-/* DUMMY driver bootstrap functions */
-
-static int DUMMY_Available(void)
-{
-	const char *envr = SDL_getenv("SDL_VIDEODRIVER");
-	if ((envr) && (SDL_strcmp(envr, DUMMYVID_DRIVER_NAME) == 0)) {
-		return(1);
-	}
-
-	return(0);
-}
-
-static void DUMMY_DeleteDevice(SDL_VideoDevice *device)
-{
-	SDL_free(device->hidden);
-	SDL_free(device);
-}
-
-static SDL_VideoDevice *DUMMY_CreateDevice(int devindex)
-{
-	SDL_VideoDevice *device;
-
-	/* Initialize all variables that we clean on shutdown */
-	device = (SDL_VideoDevice *)SDL_malloc(sizeof(SDL_VideoDevice));
-	if ( device ) {
-		SDL_memset(device, 0, (sizeof *device));
-		device->hidden = (struct SDL_PrivateVideoData *)
-				SDL_malloc((sizeof *device->hidden));
-	}
-	if ( (device == NULL) || (device->hidden == NULL) ) {
-		SDL_OutOfMemory();
-		if ( device ) {
-			SDL_free(device);
-		}
-		return(0);
-	}
-	SDL_memset(device->hidden, 0, (sizeof *device->hidden));
-
-	/* Set the function pointers */
-	device->VideoInit = DUMMY_VideoInit;
-	device->ListModes = DUMMY_ListModes;
-	device->SetVideoMode = DUMMY_SetVideoMode;
-	device->CreateYUVOverlay = NULL;
-	device->SetColors = DUMMY_SetColors;
-	device->UpdateRects = DUMMY_UpdateRects;
-	device->VideoQuit = DUMMY_VideoQuit;
-	device->AllocHWSurface = DUMMY_AllocHWSurface;
-	device->CheckHWBlit = NULL;
-	device->FillHWRect = NULL;
-	device->SetHWColorKey = NULL;
-	device->SetHWAlpha = NULL;
-	device->LockHWSurface = DUMMY_LockHWSurface;
-	device->UnlockHWSurface = DUMMY_UnlockHWSurface;
-	device->FlipHWSurface = NULL;
-	device->FreeHWSurface = DUMMY_FreeHWSurface;
-	device->SetCaption = NULL;
-	device->SetIcon = NULL;
-	device->IconifyWindow = NULL;
-	device->GrabInput = NULL;
-	device->GetWMInfo = NULL;
-	device->InitOSKeymap = DUMMY_InitOSKeymap;
-	device->PumpEvents = DUMMY_PumpEvents;
-
-	device->free = DUMMY_DeleteDevice;
-
-	return device;
-}
-
-VideoBootStrap DUMMY_bootstrap = {
-	DUMMYVID_DRIVER_NAME, "SDL dummy video driver",
-	DUMMY_Available, DUMMY_CreateDevice
-};
-
-
-int DUMMY_VideoInit(_THIS, SDL_PixelFormat *vformat)
-{
-	/*
-	fprintf(stderr, "WARNING: You are using the SDL dummy video driver!\n");
-	*/
-
-	/* Determine the screen depth (use default 8-bit depth) */
-	/* we change this during the SDL_SetVideoMode implementation... */
-	vformat->BitsPerPixel = 8;
-	vformat->BytesPerPixel = 1;
-
-	/* We're done! */
-	return(0);
-}
-
-SDL_Rect **DUMMY_ListModes(_THIS, SDL_PixelFormat *format, Uint32 flags)
-{
-   	 return (SDL_Rect **) -1;
-}
-
-SDL_Surface *DUMMY_SetVideoMode(_THIS, SDL_Surface *current,
-				int width, int height, int bpp, Uint32 flags)
-{
-	if ( this->hidden->buffer ) {
-		SDL_free( this->hidden->buffer );
-	}
-
-	this->hidden->buffer = SDL_malloc(width * height * (bpp / 8));
-	if ( ! this->hidden->buffer ) {
-		SDL_SetError("Couldn't allocate buffer for requested mode");
-		return(NULL);
-	}
-
-/* 	printf("Setting mode %dx%d\n", width, height); */
-
-	SDL_memset(this->hidden->buffer, 0, width * height * (bpp / 8));
-
-	/* Allocate the new pixel format for the screen */
-	if ( ! SDL_ReallocFormat(current, bpp, 0, 0, 0, 0) ) {
-		SDL_free(this->hidden->buffer);
-		this->hidden->buffer = NULL;
-		SDL_SetError("Couldn't allocate new pixel format for requested mode");
-		return(NULL);
-	}
-
-	/* Set up the new mode framebuffer */
-	current->flags = flags & SDL_FULLSCREEN;
-	this->hidden->w = current->w = width;
-	this->hidden->h = current->h = height;
-	current->pitch = current->w * (bpp / 8);
-	current->pixels = this->hidden->buffer;
-
-	/* We're done */
-	return(current);
-}
-
-/* We don't actually allow hardware surfaces other than the main one */
-static int DUMMY_AllocHWSurface(_THIS, SDL_Surface *surface)
-{
-	return(-1);
-}
-static void DUMMY_FreeHWSurface(_THIS, SDL_Surface *surface)
-{
-	return;
-}
-
-/* We need to wait for vertical retrace on page flipped displays */
-static int DUMMY_LockHWSurface(_THIS, SDL_Surface *surface)
-{
-	return(0);
-}
-
-static void DUMMY_UnlockHWSurface(_THIS, SDL_Surface *surface)
-{
-	return;
-}
-
-static void DUMMY_UpdateRects(_THIS, int numrects, SDL_Rect *rects)
-{
-	/* do nothing. */
-}
-
-int DUMMY_SetColors(_THIS, int firstcolor, int ncolors, SDL_Color *colors)
-{
-	/* do nothing of note. */
-	return(1);
-}
-
-/* Note:  If we are terminated, this could be called in the middle of
-   another SDL video routine -- notably UpdateRects.
-*/
-void DUMMY_VideoQuit(_THIS)
-{
-	if (this->screen->pixels != NULL)
-	{
-		SDL_free(this->screen->pixels);
-		this->screen->pixels = NULL;
-	}
-}
diff --git a/src/video/dummy/SDL_nullvideo.h b/src/video/dummy/SDL_nullvideo.h
deleted file mode 100644
index 05c19e3..0000000
--- a/src/video/dummy/SDL_nullvideo.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-    SDL - Simple DirectMedia Layer
-    Copyright (C) 1997-2012 Sam Lantinga
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
-
-    Sam Lantinga
-    slouken@libsdl.org
-*/
-#include "SDL_config.h"
-
-#ifndef _SDL_nullvideo_h
-#define _SDL_nullvideo_h
-
-#include "../SDL_sysvideo.h"
-
-/* Hidden "this" pointer for the video functions */
-#define _THIS	SDL_VideoDevice *this
-
-
-/* Private display data */
-
-struct SDL_PrivateVideoData {
-    int w, h;
-    void *buffer;
-};
-
-#endif /* _SDL_nullvideo_h */
diff --git a/src/video/dummy/SDL_nullvideo.o b/src/video/dummy/SDL_nullvideo.o
deleted file mode 100644
index d7f4912..0000000
Binary files a/src/video/dummy/SDL_nullvideo.o and /dev/null differ
diff --git a/src/video/prizm/SDL_prizmevents.c b/src/video/prizm/SDL_prizmevents.c
index 101a4c8..6004645 100644
--- a/src/video/prizm/SDL_prizmevents.c
+++ b/src/video/prizm/SDL_prizmevents.c
@@ -31,42 +31,51 @@
 #include "SDL_prizmvideo.h"
 #include "SDL_prizmevents_c.h"
 
-static int nspk_keymap[PRZ_NUMKEYS];
+static int przk_keymap[PRZ_NUMKEYS];
 static SDLKey sdlk_keymap[PRZ_NUMKEYS];
 static Uint8 key_state[PRZ_NUMKEYS];
 
 static SDLKey sdlak_keymap[4] = {SDLK_UP, SDLK_RIGHT, SDLK_DOWN, SDLK_LEFT};
 static Uint8 arrow_key_state[4];
+
+
+/* Collapse keydown()'s result for `keycode` to a strict bool: true when nonzero. */
+bool keystatus( int keycode )
+{
+    return keydown( keycode ) != 0;
+}
+
+
+/* Poll every key listed in przk_keymap and hand state changes to SDL. */
+static void PRZ_update_keyboard(void)
+{
+    /* NOTE(review): presumably refreshes the key state read by keydown() -- confirm. */
+    clearevents();
+
+    for ( int key = 0; key < PRZ_NUMKEYS; ++key )
+    {
+        /* Every key is sampled; only keys with an SDL mapping raise an event. */
+        bool is_down = keystatus( przk_keymap[key] );
+        if ( sdlk_keymap[key] != SDLK_UNKNOWN )
+            PRZ_UPDATE_KEY_EVENT(sdlk_keymap[key], key, key_state[key], is_down);
+    }
 
-static void nsp_update_keyboard(void)
-{
-	int i;
-	for ( i = 0; i < PRZ_NUMKEYS; ++i ) {
-		bool key_pressed;
-		if ( sdlk_keymap[i] == SDLK_UNKNOWN )
-			continue;
-		key_pressed = keydown(nspk_keymap[i]);
-		PRZ_UPDATE_KEY_EVENT(sdlk_keymap[i], i, key_state[i], key_pressed);
-	}
 }
-
-static void nsp_update_arrow_keys(void)
+
+static void PRZ_update_arrow_keys(void)
 {
-	bool arrow_key_pressed[4] = {
-		keydown(KEY_UP),
-		keydown(KEY_RIGHT),
-		keydown(KEY_DOWN),
-		keydown(KEY_LEFT)
-	};
-	int i;
-	for ( i = 0; i < 4; ++i )
+    clearevents();
+    /* Sample the four arrows in sdlak_keymap order: up, right, down, left. */
+	bool arrow_key_pressed[4] = { keystatus(KEY_UP), keystatus(KEY_RIGHT), keystatus(KEY_DOWN), keystatus(KEY_LEFT) };
+	/* NOTE(review): the only call visible here, in PRZ_PumpEvents, is commented out. */
+	for ( int i = 0; i < 4; ++i )
 		PRZ_UPDATE_KEY_EVENT(sdlak_keymap[i], i, arrow_key_state[i], arrow_key_pressed[i]);
 }
 
 void PRZ_PumpEvents(_THIS)
 {
-	nsp_update_keyboard();
-	nsp_update_arrow_keys();
+	PRZ_update_keyboard();
+	/* NOTE(review): PRZ_update_arrow_keys() stays disabled; the arrows are already in przk_keymap -- confirm. */
 }
 
 void PRZ_InitOSKeymap(_THIS)
@@ -74,26 +83,163 @@ void PRZ_InitOSKeymap(_THIS)
 	/* Enum value -> KEY_NSPIRE_* */
 
 
-	//nspk_keymap[NSP_KEY_RET] =	KEY_NSPIRE_RET;
-
-    /*
-    **
-    **
-    **
-    **
-    */
+przk_keymap[ PRZ_KEY_F1	]=	 KEY_F1	;
+przk_keymap[ PRZ_KEY_F2	]=	 KEY_F2	;
+przk_keymap[ PRZ_KEY_F3	]=	 KEY_F3	;
+przk_keymap[ PRZ_KEY_F4	]=	 KEY_F4	;
+przk_keymap[ PRZ_KEY_F5	]=	 KEY_F5	;
+przk_keymap[ PRZ_KEY_F6	]=	 KEY_F6	;
+przk_keymap[ PRZ_KEY_SHIFT ]= KEY_SHIFT	;
+przk_keymap[ PRZ_KEY_OPTN ]=	 KEY_OPTN	;
+przk_keymap[ PRZ_KEY_VARS ]=	 KEY_VARS	;
+przk_keymap[ PRZ_KEY_MENU ]=	 KEY_MENU	;
+przk_keymap[ PRZ_KEY_LEFT	]=	 KEY_LEFT	;
+przk_keymap[ PRZ_KEY_UP	]=	     KEY_UP	;
+przk_keymap[ PRZ_KEY_ALPHA	]=	 KEY_ALPHA	;
+przk_keymap[ PRZ_KEY_SQUARE	]=	 KEY_SQUARE	;
+przk_keymap[ PRZ_KEY_POWER	]=	 KEY_POWER	;
+przk_keymap[ PRZ_KEY_EXIT	]=	 KEY_EXIT	;
+przk_keymap[ PRZ_KEY_DOWN	]=	 KEY_DOWN	;
+przk_keymap[ PRZ_KEY_RIGHT	]=	 KEY_RIGHT	;
+przk_keymap[ PRZ_KEY_XOT	]=	 KEY_XOT	;
+przk_keymap[ PRZ_KEY_LOG	]=	 KEY_LOG	;
+przk_keymap[ PRZ_KEY_LN	]=	     KEY_LN	;
+przk_keymap[ PRZ_KEY_SIN	]=	 KEY_SIN	;
+przk_keymap[ PRZ_KEY_COS	]=	 KEY_COS	;
+przk_keymap[ PRZ_KEY_TAN	]=	 KEY_TAN	;
+przk_keymap[ PRZ_KEY_FRAC	]=	 KEY_FRAC	;
+przk_keymap[ PRZ_KEY_FD	]=	     KEY_FD	;
+przk_keymap[ PRZ_KEY_LEFTP	]=	 KEY_LEFTP	;
+przk_keymap[ PRZ_KEY_RIGHTP	]=	 KEY_RIGHTP	;
+przk_keymap[ PRZ_KEY_COMMA	]=	 KEY_COMMA	;
+przk_keymap[ PRZ_KEY_ARROW	]=	 KEY_ARROW	;
+przk_keymap[ PRZ_KEY_7	]=	 KEY_7	;
+przk_keymap[ PRZ_KEY_8	]=	 KEY_8	;
+przk_keymap[ PRZ_KEY_9	]=	 KEY_9	;
+przk_keymap[ PRZ_KEY_DEL	]=	 KEY_DEL	;
+przk_keymap[ PRZ_KEY_4	]=	 KEY_4	;
+przk_keymap[ PRZ_KEY_5	]=	 KEY_5	;
+przk_keymap[ PRZ_KEY_6	]=	 KEY_6	;
+przk_keymap[ PRZ_KEY_MUL	]=	 KEY_MUL	;
+przk_keymap[ PRZ_KEY_DIV	]=	 KEY_DIV	;
+przk_keymap[ PRZ_KEY_1	]=	 KEY_1	;
+przk_keymap[ PRZ_KEY_2	]=	 KEY_2	;
+przk_keymap[ PRZ_KEY_3	]=	 KEY_3	;
+przk_keymap[ PRZ_KEY_ADD	]=	 KEY_ADD	;
+przk_keymap[ PRZ_KEY_SUB	]=	 KEY_SUB	;
+przk_keymap[ PRZ_KEY_0	]=	 KEY_0	;
+przk_keymap[ PRZ_KEY_DOT	]=	 KEY_DOT	;
+przk_keymap[ PRZ_KEY_EXP	]=	 KEY_EXP	;
+przk_keymap[ PRZ_KEY_NEG	]=	 KEY_NEG	;
+przk_keymap[ PRZ_KEY_EXE	]=	 KEY_EXE	;
+przk_keymap[ PRZ_KEY_ACON	]=	 KEY_ACON	;
+
 
 	/* Enum value -> SDLK_*
 	   This is the actual key mapping part. */
+/*sdlk_keymap[ PRZ_KEY_F1	]=	SDLK_F1	;
+sdlk_keymap[ PRZ_KEY_F2	]=	SDLK_F2	;
+sdlk_keymap[ PRZ_KEY_F3	]=	SDLK_F3	;
+sdlk_keymap[ PRZ_KEY_F4	]=	SDLK_F4	;
+sdlk_keymap[ PRZ_KEY_F5	]=	SDLK_F5	;
+sdlk_keymap[ PRZ_KEY_F6	]=	SDLK_F6	;
+sdlk_keymap[ PRZ_KEY_SHIFT	]=	SDLK_LSHIFT	;
+sdlk_keymap[ PRZ_KEY_OPTN	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_VARS	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_MENU	]=	SDLK_MENU	;
+sdlk_keymap[ PRZ_KEY_LEFT	]=	SDLK_LEFT	;
+sdlk_keymap[ PRZ_KEY_UP	]=	SDLK_UP	;
+sdlk_keymap[ PRZ_KEY_ALPHA	]=	SDLK_CAPSLOCK	;
+sdlk_keymap[ PRZ_KEY_SQUARE	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_POWER	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_EXIT	]=	SDLK_ESCAPE	;
+sdlk_keymap[ PRZ_KEY_DOWN	]=	SDLK_DOWN	;
+sdlk_keymap[ PRZ_KEY_RIGHT	]=	SDLK_RIGHT	;
+sdlk_keymap[ PRZ_KEY_XOT	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_LOG	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_LN	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_SIN	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_COS	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_TAN	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_FRAC	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_FD	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_LEFTP	]=	SDLK_LEFTPAREN	;
+sdlk_keymap[ PRZ_KEY_RIGHTP	]=	SDLK_RIGHTPAREN	;
+sdlk_keymap[ PRZ_KEY_COMMA	]=	SDLK_COMMA	;
+sdlk_keymap[ PRZ_KEY_ARROW	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_7	]=	SDLK_KP7	;
+sdlk_keymap[ PRZ_KEY_8	]=	SDLK_KP8	;
+sdlk_keymap[ PRZ_KEY_9	]=	SDLK_KP9	;
+sdlk_keymap[ PRZ_KEY_DEL	]=	SDLK_DELETE	;
+sdlk_keymap[ PRZ_KEY_4	]=	SDLK_KP4	;
+sdlk_keymap[ PRZ_KEY_5	]=	SDLK_KP5	;
+sdlk_keymap[ PRZ_KEY_6	]=	SDLK_KP6	;
+sdlk_keymap[ PRZ_KEY_MUL	]=	SDLK_KP_MULTIPLY	;
+sdlk_keymap[ PRZ_KEY_DIV	]=	SDLK_KP_DIVIDE	;
+sdlk_keymap[ PRZ_KEY_1	]=	SDLK_KP1	;
+sdlk_keymap[ PRZ_KEY_2	]=	SDLK_KP2	;
+sdlk_keymap[ PRZ_KEY_3	]=	SDLK_KP3	;
+sdlk_keymap[ PRZ_KEY_ADD	]=	SDLK_KP_PLUS	;
+sdlk_keymap[ PRZ_KEY_SUB	]=	SDLK_KP_MINUS	;
+sdlk_keymap[ PRZ_KEY_0	]=	SDLK_KP0	;
+sdlk_keymap[ PRZ_KEY_DOT	]=	SDLK_KP_PERIOD	;
+sdlk_keymap[ PRZ_KEY_EXP	]=	SDLK_UNKNOWN	;
+sdlk_keymap[ PRZ_KEY_NEG	]=	SDLK_MINUS	;
+sdlk_keymap[ PRZ_KEY_EXE	]=	SDLK_RETURN	;
+sdlk_keymap[ PRZ_KEY_ACON	]=	SDLK_POWER	;
+*/
 
+sdlk_keymap[	PRZ_KEY_F1	]=SDLK_PRZ_KEY_F1	;
+sdlk_keymap[	 PRZ_KEY_F2	]=SDLK_PRZ_KEY_F2	;
+sdlk_keymap[	 PRZ_KEY_F3	]=SDLK_PRZ_KEY_F3	;
+sdlk_keymap[	 PRZ_KEY_F4	]=SDLK_PRZ_KEY_F4	;
+sdlk_keymap[	 PRZ_KEY_F5	]=SDLK_PRZ_KEY_F5	;
+sdlk_keymap[	 PRZ_KEY_F6	]=SDLK_PRZ_KEY_F6	;
+sdlk_keymap[	 PRZ_KEY_SHIFT	]=SDLK_PRZ_KEY_SHIFT	;
+sdlk_keymap[	PRZ_KEY_OPTN	]=SDLK_PRZ_KEY_OPTN	;
+sdlk_keymap[	PRZ_KEY_VARS	]=SDLK_PRZ_KEY_VARS	;
+sdlk_keymap[	PRZ_KEY_MENU	]=SDLK_PRZ_KEY_MENU	;
+sdlk_keymap[	 PRZ_KEY_LEFT	]=SDLK_PRZ_KEY_LEFT	;
+sdlk_keymap[	 PRZ_KEY_UP	]=SDLK_PRZ_KEY_UP	;
+sdlk_keymap[	 PRZ_KEY_ALPHA	]=SDLK_PRZ_KEY_ALPHA	;
+sdlk_keymap[	 PRZ_KEY_SQUARE	]=SDLK_PRZ_KEY_SQUARE	;
+sdlk_keymap[	 PRZ_KEY_POWER	]=SDLK_PRZ_KEY_POWER	;
+sdlk_keymap[	 PRZ_KEY_EXIT	]=SDLK_PRZ_KEY_EXIT	;
+sdlk_keymap[	 PRZ_KEY_DOWN	]=SDLK_PRZ_KEY_DOWN	;
+sdlk_keymap[	 PRZ_KEY_RIGHT	]=SDLK_PRZ_KEY_RIGHT	;
+sdlk_keymap[	 PRZ_KEY_XOT	]=SDLK_PRZ_KEY_XOT	;
+sdlk_keymap[	 PRZ_KEY_LOG	]=SDLK_PRZ_KEY_LOG	;
+sdlk_keymap[	 PRZ_KEY_LN	]=SDLK_PRZ_KEY_LN	;
+sdlk_keymap[	 PRZ_KEY_SIN	]=SDLK_PRZ_KEY_SIN	;
+sdlk_keymap[	 PRZ_KEY_COS	]=SDLK_PRZ_KEY_COS	;
+sdlk_keymap[	 PRZ_KEY_TAN	]=SDLK_PRZ_KEY_TAN	;
+sdlk_keymap[	 PRZ_KEY_FRAC	]=SDLK_PRZ_KEY_FRAC	;
+sdlk_keymap[	 PRZ_KEY_FD	]=SDLK_PRZ_KEY_FD	;
+sdlk_keymap[	 PRZ_KEY_LEFTP	]=SDLK_PRZ_KEY_LEFTP	;
+sdlk_keymap[	 PRZ_KEY_RIGHTP	]=SDLK_PRZ_KEY_RIGHTP	;
+sdlk_keymap[	 PRZ_KEY_COMMA	]=SDLK_PRZ_KEY_COMMA	;
+sdlk_keymap[	 PRZ_KEY_ARROW	]=SDLK_PRZ_KEY_ARROW	;
+sdlk_keymap[	 PRZ_KEY_7	]=SDLK_PRZ_KEY_7	;
+sdlk_keymap[	 PRZ_KEY_8	]=SDLK_PRZ_KEY_8	;
+sdlk_keymap[	 PRZ_KEY_9	]=SDLK_PRZ_KEY_9	;
+sdlk_keymap[	 PRZ_KEY_DEL	]=SDLK_PRZ_KEY_DEL	;
+sdlk_keymap[	 PRZ_KEY_4	]=SDLK_PRZ_KEY_4	;
+sdlk_keymap[	 PRZ_KEY_5	]=SDLK_PRZ_KEY_5	;
+sdlk_keymap[	 PRZ_KEY_6	]=SDLK_PRZ_KEY_6	;
+sdlk_keymap[	 PRZ_KEY_MUL	]=SDLK_PRZ_KEY_MUL	;
+sdlk_keymap[	 PRZ_KEY_DIV	]=SDLK_PRZ_KEY_DIV	;
+sdlk_keymap[	 PRZ_KEY_1	]=SDLK_PRZ_KEY_1	;
+sdlk_keymap[	 PRZ_KEY_2	]=SDLK_PRZ_KEY_2	;
+sdlk_keymap[	 PRZ_KEY_3	]=SDLK_PRZ_KEY_3	;
+sdlk_keymap[	 PRZ_KEY_ADD	]=SDLK_PRZ_KEY_ADD	;
+sdlk_keymap[	 PRZ_KEY_SUB	]=SDLK_PRZ_KEY_SUB	;
+sdlk_keymap[	 PRZ_KEY_0	]=SDLK_PRZ_KEY_0	;
+sdlk_keymap[	 PRZ_KEY_DOT	]=SDLK_PRZ_KEY_DOT	;
+sdlk_keymap[	 PRZ_KEY_EXP	]=SDLK_PRZ_KEY_EXP	;
+sdlk_keymap[	 PRZ_KEY_NEG	]=SDLK_PRZ_KEY_NEG	;
+sdlk_keymap[	 PRZ_KEY_EXE	]=SDLK_PRZ_KEY_EXE	;
+sdlk_keymap[	 PRZ_KEY_ACON	]=SDLK_PRZ_KEY_ACON	;
 
-	//	sdlk_keymap[NSP_KEY_A] =	SDLK_a;
-    /*
-    **
-    **
-    **
-    **
-    */
 }
 
 /* end of SDL_tinspireevents.c ... */
diff --git a/src/video/prizm/SDL_prizmevents_c.h b/src/video/prizm/SDL_prizmevents_c.h
index ad75330..81bd05d 100644
--- a/src/video/prizm/SDL_prizmevents_c.h
+++ b/src/video/prizm/SDL_prizmevents_c.h
@@ -90,6 +90,10 @@ enum {
 	PRZ_KEY_NEG,
 	PRZ_KEY_EXE,
 	PRZ_KEY_ACON,
+//	PRZ_KEY_LEFTUP,
+//	PRZ_KEY_LEFTDOWN,
+//	PRZ_KEY_RIGHTUP,
+//	PRZ_KEY_RIGHTDOWN,
 	PRZ_NUMKEYS
 };
 
diff --git a/src/video/prizm/SDL_prizmfonts.c b/src/video/prizm/SDL_prizmfonts.c
index 430498c..aeb6af5 100644
--- a/src/video/prizm/SDL_prizmfonts.c
+++ b/src/video/prizm/SDL_prizmfonts.c
@@ -3,9 +3,9 @@
 #include "SDL_prizmvideo.h"
 #include "SDL_prizmfonts.h"
 
-nSDL_Font *nSDL_LoadFont(int font_index, Uint8 r, Uint8 g, Uint8 b)
+cSDL_Font *cSDL_LoadFont(int font_index, Uint8 r, Uint8 g, Uint8 b)
 {
-	nSDL_Font *font;
+	cSDL_Font *font;
 	int i, j, k;
 
 	font = SDL_malloc(sizeof(*font));
@@ -14,11 +14,11 @@ nSDL_Font *nSDL_LoadFont(int font_index, Uint8 r, Uint8 g, Uint8 b)
 		return(NULL);
 	}
 
-	for ( i = 0; i < NSP_FONT_NUMCHARS; ++i ) {
+	for ( i = 0; i < PRZ_FONT_NUMCHARS; ++i ) {
 		int offset = 8 * i;
 		int max_width = 0;
 		Uint32 color;
-		SDL_Surface *tmp = SDL_CreateRGBSurface(SDL_SWSURFACE, NSP_FONT_WIDTH, NSP_FONT_HEIGHT,
+		SDL_Surface *tmp = SDL_CreateRGBSurface(SDL_SWSURFACE, PRZ_FONT_WIDTH, PRZ_FONT_HEIGHT,
 							16, PRZ_RMASK16, PRZ_GMASK16, PRZ_BMASK16, 0);
 		if ( tmp == NULL ) {
 			SDL_OutOfMemory();
@@ -27,16 +27,16 @@ nSDL_Font *nSDL_LoadFont(int font_index, Uint8 r, Uint8 g, Uint8 b)
 		color = SDL_MapRGB(tmp->format, r, g, b);
 		SDL_FillRect(tmp, NULL, ! color);
 		SDL_SetColorKey(tmp, SDL_SRCCOLORKEY, ! color);
-		font->char_width[i] = NSP_FONT_WIDTH;
+		font->char_width[i] = PRZ_FONT_WIDTH;
 		SDL_LockSurface(tmp);
-		for ( j = 0; j < NSP_FONT_HEIGHT; ++j )
-			for ( k = 0; k < NSP_FONT_WIDTH; ++k ) {
-				if ( nsp_font_charmaps[font_index][offset + j] & (1 << (NSP_FONT_WIDTH - k - 1)) ) { /* "Pixel" set */
+		for ( j = 0; j < PRZ_FONT_HEIGHT; ++j )
+			for ( k = 0; k < PRZ_FONT_WIDTH; ++k ) {
+				if ( prz_font_charmaps[font_index][offset + j] & (1 << (PRZ_FONT_WIDTH - k - 1)) ) { /* "Pixel" set */
 					if ( k > max_width ) {
 						font->char_width[i] = k + 1;
 						max_width = k;
 					}
-					nSDL_SetPixel(tmp, k, j, color);
+					cSDL_SetPixel(tmp, k, j, color);
 				}
 			}
 		SDL_UnlockSurface(tmp);
@@ -49,28 +49,28 @@ nSDL_Font *nSDL_LoadFont(int font_index, Uint8 r, Uint8 g, Uint8 b)
 	return(font);
 }
 
-void nSDL_SetFontSpacing(nSDL_Font *font, int hspacing, int vspacing)
+void cSDL_SetFontSpacing(cSDL_Font *font, int hspacing, int vspacing)
 {
 	font->hspacing = hspacing;
 	font->vspacing = vspacing;
 }
 
-void nSDL_EnableFontMonospaced(nSDL_Font *font, SDL_bool toggle)
+void cSDL_EnableFontMonospaced(cSDL_Font *font, SDL_bool toggle)
 {
 	font->monospaced = toggle;
 }
 
-void nSDL_FreeFont(nSDL_Font *font)
+void cSDL_FreeFont(cSDL_Font *font)
 {
 	int i;
 	if ( font == NULL )
 		return;
-	for ( i = 0; i < NSP_FONT_NUMCHARS; ++i )
+	for ( i = 0; i < PRZ_FONT_NUMCHARS; ++i )
 		SDL_FreeSurface(font->chars[i]);
 	SDL_free(font);
 }
 
-int nSDL_DrawString(SDL_Surface *surface, nSDL_Font *font,
+int cSDL_DrawString(SDL_Surface *surface, cSDL_Font *font,
 		    int x, int y, const char *format, ...)
 {
 	char buf[PRZ_BUF_SIZE];
@@ -92,22 +92,22 @@ int nSDL_DrawString(SDL_Surface *surface, nSDL_Font *font,
 		int c = buf[i];
 		if ( c == '\n' ) {
 			pos.x = x;
-			pos.y += NSP_FONT_HEIGHT + font->vspacing;
+			pos.y += PRZ_FONT_HEIGHT + font->vspacing;
 		} else {
 			SDL_Rect rect;
 			rect.x = rect.y = 0;
 			rect.w = font->char_width[c];
-			rect.h = NSP_FONT_HEIGHT;
+			rect.h = PRZ_FONT_HEIGHT;
 			if ( SDL_BlitSurface(font->chars[c], &rect, surface, &pos) == -1 )
 				return(-1);
-			pos.x += NSP_CHAR_WIDTH(font, c) + font->hspacing;
+			pos.x += PRZ_CHAR_WIDTH(font, c) + font->hspacing;
 		}
 	}
 
 	return(0);
 }
 
-int nSDL_GetStringWidth(nSDL_Font *font, const char *s)
+int cSDL_GetStringWidth(cSDL_Font *font, const char *s)
 {
 	int width = 0;
 	int max_width = 0;
@@ -117,17 +117,17 @@ int nSDL_GetStringWidth(nSDL_Font *font, const char *s)
 				max_width = width;
 			width = 0;
 		} else
-			width += NSP_CHAR_WIDTH(font, *s) + font->hspacing;
+			width += PRZ_CHAR_WIDTH(font, *s) + font->hspacing;
 	} while ( *s++ );
 	return(max_width - font->hspacing);
 }
 
-int nSDL_GetStringHeight(nSDL_Font *font, const char *s)
+int cSDL_GetStringHeight(cSDL_Font *font, const char *s)
 {
 	int height = 0;
 	do {
 		if ( *s == '\n' || *s == '\0' )
-			height += NSP_FONT_HEIGHT + font->vspacing;
+			height += PRZ_FONT_HEIGHT + font->vspacing;
 	} while ( *s++ );
 	return(height - font->vspacing);
 }
diff --git a/src/video/prizm/SDL_prizmfonts.h b/src/video/prizm/SDL_prizmfonts.h
index d61ca66..75e259f 100644
--- a/src/video/prizm/SDL_prizmfonts.h
+++ b/src/video/prizm/SDL_prizmfonts.h
@@ -1,11 +1,11 @@
-#ifndef _SDL_tinspirefonts_h
-#define _SDL_tinspirefonts_h
+#ifndef _SDL_prizmfonts_h
+#define _SDL_prizmfonts_h
 
-#define NSP_CHAR_WIDTH(font, c) (font->monospaced ? NSP_FONT_WIDTH : font->char_width[(int)c])
+#define PRZ_CHAR_WIDTH(font, c) (font->monospaced ? PRZ_FONT_WIDTH : font->char_width[(int)c])
 
 
-static unsigned char nsp_font_charmaps[NSP_NUMFONTS][2048] = {
-	/* NSDL_FONT_THIN */
+static unsigned char prz_font_charmaps[PRZ_NUMFONTS][2048] = {
+	/* cSDL_FONT_THIN */
 	{
 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Char 000 (.)
 		0x7E, 0x81, 0xA5, 0x81, 0xBD, 0x99, 0x81, 0x7E, // Char 001 (.)
@@ -265,7 +265,7 @@ static unsigned char nsp_font_charmaps[NSP_NUMFONTS][2048] = {
 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  // Char 255 (.)
 	},
 
-	/* NSDL_FONT_SPACE */
+	/* cSDL_FONT_SPACE */
 	{
 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Char 000 (.)
 		0x20, 0x7E, 0x81, 0xA5, 0x81, 0xBD, 0x99, 0x81, // Char 001 (.)
@@ -525,7 +525,7 @@ static unsigned char nsp_font_charmaps[NSP_NUMFONTS][2048] = {
 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  // Char 255 (.)
 	},
 
-	/* NSDL_FONT_VGA */
+	/* cSDL_FONT_VGA */
 	{
 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Char 000 (.)
 		0x7E, 0x81, 0xA5, 0x81, 0xBD, 0x99, 0x81, 0x7E, // Char 001 (.)
@@ -785,7 +785,7 @@ static unsigned char nsp_font_charmaps[NSP_NUMFONTS][2048] = {
 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  // Char 255 (.)
 	},
 
-	/* NSDL_FONT_FANTASY */
+	/* cSDL_FONT_FANTASY */
 	{
 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Char 000 (.)
 		0x7E, 0x81, 0xA5, 0x81, 0xBD, 0x99, 0x81, 0x7E, // Char 001 (.)
@@ -1045,7 +1045,7 @@ static unsigned char nsp_font_charmaps[NSP_NUMFONTS][2048] = {
 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  // Char 255 (.)
 	},
 
-	/* NSDL_FONT_TINYTYPE */
+	/* cSDL_FONT_TINYTYPE */
 	{
 		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Char 000 (.)
 		0x7E, 0x81, 0xA5, 0x81, 0xA5, 0x99, 0x81, 0x7E, // Char 001 (.)
@@ -1306,4 +1306,4 @@ static unsigned char nsp_font_charmaps[NSP_NUMFONTS][2048] = {
 	}
 };
 
-#endif /* _SDL_tinspirefonts_h */
+#endif /* _SDL_prizmfonts_h */
diff --git a/src/video/prizm/SDL_prizmnti.c b/src/video/prizm/SDL_prizmnti.c
index 04fb600..31ab419 100644
--- a/src/video/prizm/SDL_prizmnti.c
+++ b/src/video/prizm/SDL_prizmnti.c
@@ -32,7 +32,7 @@ SDL_Surface *nSDL_LoadImage(Uint16 *data)
 	SDL_LockSurface(image);
 	for ( i = 0; i < nti_info.height; ++i )
 		for( j = 0; j < nti_info.width; ++j)
-			nSDL_SetPixel(image, j, i, data[j + (nti_info.width * i)]);
+			cSDL_SetPixel(image, j, i, data[j + (nti_info.width * i)]);
 	SDL_UnlockSurface(image);
 	return(image);
 }
diff --git a/src/video/prizm/SDL_prizmvideo.c b/src/video/prizm/SDL_prizmvideo.c
index 4516f2e..c43cf0d 100644
--- a/src/video/prizm/SDL_prizmvideo.c
+++ b/src/video/prizm/SDL_prizmvideo.c
@@ -94,11 +94,13 @@ static SDL_VideoDevice *PRZ_CreateDevice(int devindex)
 	return device;
 }
 
+
 VideoBootStrap PRZ_bootstrap = {
 	"prizm", "SDL Casio PRIZM video driver",
 	PRZ_Available, PRZ_CreateDevice
 };
 
+
 static int PRZ_VideoInit(_THIS, SDL_PixelFormat *vformat)
 {
 	this->hidden->cx = (int) true;
@@ -142,8 +144,8 @@ static SDL_Surface *PRZ_SetVideoMode(_THIS, SDL_Surface *current,
 		rmask = PRZ_RMASK16;
 		gmask = PRZ_GMASK16;
 		bmask = PRZ_BMASK16;
-	} 
-	
+	}
+
 	if ( this->hidden->buffer ) {
 		SDL_free( this->hidden->buffer );
 	}
@@ -215,7 +217,8 @@ static void PRZ_UpdateRects(_THIS, int numrects, SDL_Rect *rects)
 
 		src_addr = PRZ_PIXEL_ADDR(SDL_VideoSurface->pixels, rect->x, rect->y,
 					  SDL_VideoSurface->pitch, SDL_VideoSurface->format->BytesPerPixel);
-		dst_addr = PRZ_PIXEL_ADDR(this->hidden->buffer2, rect->x, rect->y, 2 * DWIDTH, 2);
+		//dst_addr = PRZ_PIXEL_ADDR(this->hidden->buffer2, rect->x, rect->y, 2 * DWIDTH, 2);
+		dst_addr = PRZ_PIXEL_ADDR(gint_vram, rect->x, rect->y, 2 * DWIDTH, 2);
 		dst_addr += this->hidden->offset;
 
 		odd_left = (this->hidden->win_x + rect->x) & 1;
@@ -232,7 +235,7 @@ static void PRZ_UpdateRects(_THIS, int numrects, SDL_Rect *rects)
 				PRZ_DRAW_LOOP(
 					SDL_memcpy(dst_addr, src_addr, row_bytes);
 				);
-			} 
+			}
 			dupdate();
 		}
 	}