/*
 *  desktop -- The 3dfx Desktop Demo 
 *  COPYRIGHT 3DFX INTERACTIVE, INC. 1999
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "basics.h"
#include "mathutil.h"

#include <stdint.h>
#include <string.h>

// The functions below are declared __declspec(naked) because their USE_ASM
// bodies address arguments relative to esp and execute their own `ret`.
// The portable C bodies must NOT be naked, so the qualifier is stripped only
// when the C fallbacks are being compiled; with USE_ASM defined it is left
// alone so the assembly bodies keep the calling sequence they rely on.
#ifndef USE_ASM
#define __declspec(x)
#endif


// storage for an old/new FPU control word pair; nothing in the visible part
// of this file reads or writes them
static unsigned short g_fpu_cw_old;
static unsigned short g_fpu_cw_new;

// Stub: accepts the requested precision and does nothing with it.
void SetFPUprecision(FPUprecision precision)
{
	(void)precision;
}

// Stub counterpart of SetFPUprecision(): there is no state to put back,
// so the call returns immediately.
void RestoreFPUprecision()
{
	return;
}

// takes a normalized (a, r, g, b) and returns a 32-bit packed argb (8-bits per component)
// the (a, r, g, b) must be in the range [0.0 - 255.0/256.0] otherwise the results will be wrong
// NOTE: it is essential that the fpu be in 53-bit or 64-bit precision mode
// Each MAGIC_x holds the IEEE-754 single-precision bit pattern of 1.5 * 2^k
// (i.e. 2^(k-1) + 2^k, spelled out per constant below).  The asm routines
// read them as floats through `dword ptr [MAGIC_x]` memory operands, so they
// have to stay addressable objects with these names.
// Adding MAGIC_x to a value in [0, 1) and storing the sum as a 64-bit double
// leaves value * 2^(8 + shift) in the low 32 bits of the stored double,
// where shift is 24 / 16 / 8 / 0 for A / R / G / B.
const unsigned long MAGIC_A = 0x49c00000; // 2^(51-8-24) + 2^(52-8-24)
const unsigned long MAGIC_R = 0x4dc00000; // 2^(51-8-16) + 2^(52-8-16)
const unsigned long MAGIC_G = 0x51c00000; // 2^(51-8- 8) + 2^(52-8- 8)
const unsigned long MAGIC_B = 0x55c00000; // 2^(51-8- 0) + 2^(52-8- 0)
// every colour component is multiplied by this factor before its magic
// constant is added
const float SCALE_255 = 255.0f/256.0f;
#ifdef USE_ASM
// x86 inline-assembly implementation of PackARGB (only compiled under USE_ASM).
//
// The body assumes it is compiled without a compiler-generated prologue or
// epilogue: the arguments are read at [esp + 4] (a), [esp + 8] (r),
// [esp + 12] (g) and [esp + 16] (b), the packed result is left in eax and the
// routine executes its own `ret`.
//
// Method: every component is multiplied by SCALE_255, then its MAGIC_x
// constant is added.  Each sum is stored as a 64-bit double over the incoming
// argument slots (used here as scratch space) and the low dword of that
// double is read back as an integer; it already carries the 8-bit channel at
// its final bit position.  a, r and g are masked to their byte, b is OR-ed in
// unmasked.
//
// The `//` lines that follow each instruction list the x87 register stack,
// top of stack first.  The number in parentheses looks like the remaining
// latency of that value in cycles -- TODO confirm.
//
// The two __asm blocks inside the /* ... */ comment after the live code are
// disabled alternative implementations.
__declspec(naked) unsigned long PackARGB(float a, float r, float g, float b)
{
	__asm
	{
		fld			dword ptr [esp + 4]
		// a
		fld			dword ptr [esp + 8]
		// r
		// a

		fld			dword ptr [SCALE_255]
		// 255.0/256.0
		// r
		// a
		fmul		st(2), st
		// 255.0/256.0
		// r
		// a' (2)
		fmul		st(1), st
		// 255.0/256.0
		// r' (2)
		// a' (1)

		fld			dword ptr [esp + 12]
		// g
		// 255.0/256.0
		// r' (1)
		// a' (0)
		fmul		st, st(1)
		// g' (2)
		// 255.0/256.0
		// r' (0)
		// a' (0)
		fld			dword ptr [esp + 16]
		// b
		// g' (2)
		// 255.0/256.0
		// r' (0)
		// a' (0)

		fmulp		st(2), st
		// g' (1)
		// b' (2)
		// r' (0)
		// a' (0)
		fld			dword ptr [MAGIC_A]
		// MAGIC_A
		// g' (0)
		// b' (1)
		// r' (0)
		// a' (0)
		faddp		st(4), st
		// g' (0)
		// b' (1)
		// r' (0)
		// a' + MAGIC_A (2)

		fadd		dword ptr [MAGIC_G]
		// g' + MAGIC_G (2)
		// b' (0)
		// r' (0)
		// a' + MAGIC_A (1)
		fld			dword ptr [MAGIC_R]
		// MAGIC_R
		// g' + MAGIC_G (1)
		// b' (0)
		// r' (0)
		// a' + MAGIC_A (0)
		faddp		st(3), st
		// g' + MAGIC_G (0)
		// b' (0)
		// r' + MAGIC_R (2)
		// a' + MAGIC_A (0)

		fld			dword ptr [MAGIC_B]
		// MAGIC_B
		// g' + MAGIC_G (0)
		// b' (0)
		// r' + MAGIC_R (1)
		// a' + MAGIC_A (0)
		faddp		st(2), st
		// g' + MAGIC_G (0)
		// b' + MAGIC_B (2)
		// r' + MAGIC_R (0)
		// a' + MAGIC_A (0)
		fxch		st(3)
		// a' + MAGIC_A (0)
		// b' + MAGIC_B (2)
		// r' + MAGIC_R (0)
		// g' + MAGIC_G (0)

		// from here on the sums are popped as doubles into the argument
		// slots ([esp + 4] and [esp + 12], 8 bytes each) and the low dword
		// of each is picked up as an integer
		fstp		qword ptr [esp + 4]
		mov			eax, dword ptr [esp + 4] // eax = a

		fstp		qword ptr [esp + 12]
		and			eax, 0xff000000
		mov			edx, dword ptr [esp + 12] // edx = b

		fstp		qword ptr [esp + 4]
		or			eax, edx
		mov			ecx, dword ptr [esp + 4] // ecx = r

		fstp		qword ptr [esp + 12]
		and			ecx, 0x00ff0000
		mov			edx, dword ptr [esp + 12] // edx = g

		and			edx, 0x0000ff00
		or			eax, ecx
		or			eax, edx

		ret
	}
	/*
	__asm
	{
		fld			dword ptr [esp + 4]
		// a
		fmul		dword ptr [SCALE_255]
		// a (2)
		fld			dword ptr [esp + 8]
		// r
		// a (1)
		fmul		dword ptr [SCALE_255]
		// r (2)
		// a (0)
		fld			dword ptr [esp + 12]
		// g
		// r (1)
		// a (0)
		fmul		dword ptr [SCALE_255]
		// g (2)
		// r (0)
		// a (0)
		fld			dword ptr [esp + 16]
		// b
		// g (1)
		// r (0)
		// a (0)
		fmul		dword ptr [SCALE_255]
		// b (2)
		// g (0)
		// r (0)
		// a (0)
		fxch		st(3)
		// a (0)
		// g (0)
		// r (0)
		// b (2)
		fadd		dword ptr [MAGIC_A]
		// a + MAGIC_A (2)
		// g (0)
		// r (0)
		// b (1)
		fxch		st(2)
		// r (0)
		// g (0)
		// a + MAGIC_A (2)
		// b (1)
		fadd		dword ptr [MAGIC_R]
		// r + MAGIC_R (2)
		// g (0)
		// a + MAGIC_A (1)
		// b (0)
		fxch		st(1)
		// g (0)
		// r + MAGIC_R (2)
		// a + MAGIC_A (1)
		// b (0)
		fadd		dword ptr [MAGIC_G]
		// g + MAGIC_G (2)
		// r + MAGIC_R (1)
		// a + MAGIC_A (0)
		// b (0)
		fxch		st(3)
		// b (0)
		// r + MAGIC_R (1)
		// a + MAGIC_A (0)
		// g + MAGIC_G (2)
		fadd		dword ptr [MAGIC_B]
		// b + MAGIC_B (2)
		// r + MAGIC_R (0)
		// a + MAGIC_A (0)
		// g + MAGIC_G (1)
		fxch		st(2)
		// a + MAGIC_A (0)
		// r + MAGIC_R (0)
		// b + MAGIC_B (2)
		// g + MAGIC_G (1)
		fstp		qword ptr [esp + 4]
		fstp		qword ptr [esp + 12]
		mov			eax, dword ptr [esp + 4]
		mov			edx, dword ptr [esp + 12]
		fstp		qword ptr [esp + 4]
		fstp		qword ptr [esp + 12]
		and			eax, 0xff000000
		and			edx, 0x00ff0000
		or			eax, edx
		mov			ecx, dword ptr [esp + 4]
		mov			edx, dword ptr [esp + 12]
//		and			ecx, 0x000000ff
		and			edx, 0x0000ff00
		or			eax, ecx
		or			eax, edx

		ret
	}
	__asm
	{
		push		ebx
		mov			ecx, 127

		mov			eax, [esp + 8] // alpha
		push		edi

		mov			edx, eax
		push		esi

		shr			edx, 23
		and			eax, (1<<23)-1

		mov			edi, 31
		sub			ecx, edx

		add			eax, (1<<23)-1
		cmp			edi, ecx

		sbb			edi, edi
		mov			ebx, [esp + 20] // red

		mov			edx, ebx
		or			ecx, edi

		shr			eax, cl

		shl			eax, 24-(23-8)
		and			ebx, (1<<23)-1

		shr			edx, 23
		mov			ecx, 127

		and			eax, 0xff000000
		mov			edi, 31

		sub			ecx, edx
		add			ebx, (1<<23)-1

		mov			esi, [esp + 24] // green
		cmp			edi, ecx

		sbb			edi, edi
		mov			edx, esi

		or			ecx, edi
		and			esi, (1<<23)-1

		shr			ebx, cl

		shl			ebx, 16-(23-8)
		mov			ecx, 127+(23-8)-8

		shr			edx, 23
		and			ebx, 0x00ff0000

		or			eax, ebx
		add			esi, (1<<23)-1

		sub			ecx, edx
		mov			edi, 31

		mov			ebx, [esp + 28] // blue
		cmp			edi, ecx

		sbb			edi, edi
		mov			edx, ebx

		shr			ebx, 23
		or			ecx, edi

		shr			esi, cl

		and			edx, (1<<23)-1
		and			esi, 0x0000ff00

		mov			ecx, 127+(23-8)
		or			eax, esi

		add			edx, (1<<23)-1
		sub			ecx, ebx

		pop			esi
		mov			ebx, 31

		pop			edi
		cmp			ebx, ecx

		sbb			ebx, ebx

		or			ecx, ebx

		shr			edx, cl

		pop			ebx
		or			eax, edx

		ret
	}
	*/
}

// takes a normalized (a, i) and returns a 32-bit packed argb (8-bits per component)
// i = r = g = b, i.e. grey scale
// the (a, i) must be in the range [0.0 - 1.0] otherwise the results will be wrong
//
// x86 inline-assembly only (compiled under USE_ASM).  The body assumes there
// is no compiler-generated prologue/epilogue: a is read at [esp + 4], i at
// [esp + 8], the result is left in eax and the routine executes its own `ret`.
// Both inputs are scaled by SCALE_255; MAGIC_B is added to i and MAGIC_A to a.
// The sums are stored as 64-bit doubles over the two argument slots and their
// low dwords are read back as integers.  The intensity dword is replicated
// into the three low bytes (i | i << 8 | i << 16) and the alpha dword is
// masked with 0xff000000.
// The `//` lines after each instruction list the x87 stack, top first.
// The __asm block inside the /* ... */ comment is a disabled alternative.
__declspec(naked) unsigned long PackARGBIntensity(float a, float i)
{
	__asm
	{
		fld			dword ptr [esp + 4]
		// a
		fld			dword ptr [esp + 8]
		// i
		// a

		fxch		st(1)
		// a
		// i
		fld			dword ptr [SCALE_255]
		// 255.0/256.0
		// a
		// i
		fmul		st(2), st
		// 255.0/256.0
		// a
		// i' (2)

		fmulp		st(1), st
		// a' (2)
		// i' (1)
		fld			dword ptr [MAGIC_B]
		// MAGIC_B
		// a' (1)
		// i' (0)
		faddp		st(2), st
		// a' (0)
		// i' + MAGIC_B (2)

		fadd		dword ptr [MAGIC_A]
		// a' + MAGIC_A (2)
		// i' + MAGIC_B (1)
		fxch		st(1)
		// i' + MAGIC_B (1)
		// a' + MAGIC_A (2)

		// intensity first: its low dword is copied into both edx and ecx
		fstp		qword ptr [esp + 4]
		mov			edx, dword ptr [esp + 4]
		mov			ecx, dword ptr [esp + 4]

		// then alpha, through the same scratch slot
		fstp		qword ptr [esp + 4]
		shl			edx, 8
		mov			eax, dword ptr [esp + 4]

		or			edx, ecx
		and			eax, 0xff000000
		shl			ecx, 16

		or			edx, ecx
		or			eax, edx

		ret
	}
	/*
	__asm
	{
		push		ebx
		mov			ecx, 127+(23-8)

		mov			eax, [esp + 12] // eax = i
		push		edi

		mov			ebx, eax
		and			eax, (1<<23)-1 // eax = frac

		shr			ebx, 23
		add			eax, (1<<23)-1

		sub			ecx, ebx // ecx = exp
		mov			ebx, 31

		mov			edx, [esp + 12] // edx = a
		cmp			ebx, ecx

		sbb			ebx, ebx
		and			edx, (1<<23)-1 // edx = frac

		add			edx, (1<<23)-1
		or			ecx, ebx

		shr			eax, cl

		mov			ebx, [esp + 12]
		mov			ecx, 127

		shr			ebx, 23
		mov			edi, 31

		sub			ecx, ebx // ecx = exp
		mov			ebx, eax

		shl			eax, 8
		cmp			edi, ecx

		sbb			edi, edi
		or			eax, ebx

		shl			eax, 8
		or			ecx, edi

		shr			edx, cl

		shl			edx, 24-(23-8)
		or			eax, ebx

		pop			edi
		and			edx, 0xff000000

		pop			ebx
		or			eax, edx

		ret
	}
	*/
}
#endif

#ifndef USE_ASM
// Gary Tarolli's clever inverse square root technique
//
// Returns an approximation of 1/sqrt(f).  The bit pattern of f is turned into
// an initial guess with one integer subtraction and then refined by a single
// iteration step.  Meaningful for positive, finite f only.
//
// The float <-> integer reinterpretation goes through memcpy on a uint32_t:
// reading the float through a `long *` touches 8 bytes where long is 64 bits
// wide and breaks the aliasing rules everywhere.  The shift is done on an
// unsigned value, matching the `shr` used by the assembly version.
float fsqrt_inv(float f)
{
	uint32_t bits;
	float half_f, y;

	half_f = 0.5f*f;

	// initial guess: magic constant minus half of the raw bit pattern
	memcpy(&bits, &f, sizeof bits);
	bits = 0x5f3759dfu - (bits >> 1);
	memcpy(&y, &bits, sizeof y);

	// repeat this iteration for more accuracy
	y = 1.5f*y - (half_f*y * y*y);

	return y;
}
#else
// constants the assembly version reads as `dword ptr` memory operands
const float ONE_HALF = 0.5f;
const float THREE_HALVES = 1.5f;
// x86 inline-assembly version of fsqrt_inv (compiled under USE_ASM).
// The body assumes no compiler prologue/epilogue: f is read at [esp + 4] and
// the result is left in st(0) before the routine's own `ret`.
// The integer unit builds the initial guess y = 0x5f3759df - (bits(f) >> 1)
// and writes it back over the argument slot, from where the FPU reads it as
// a float.  The result is 1.5f*y - (0.5f*f*y)*(y*y).
// The `//` lines after each instruction list the x87 stack, top first.
__declspec(naked) float fsqrt_inv(float f)
{
	__asm // 18 cycles
	{
		fld			dword ptr [esp + 4]
		// f
		fmul		dword ptr [ONE_HALF]
		// x2 = 0.5f*f

		mov			eax, [esp + 4]
		mov			ecx, 0x5f3759df

		shr			eax, 1

		sub			ecx, eax

		// the argument slot now holds the bit pattern of the initial guess y
		mov			[esp + 4], ecx

		fmul		dword ptr [esp + 4]
		// x2*y
		fld			dword ptr [esp + 4]
		// y
		// x2*y
		fmul		dword ptr [esp + 4]
		// y*y
		// x2*y
		fld			dword ptr [THREE_HALVES]
		// 1.5f
		// y*y
		// x2*y
		fmul		dword ptr [esp + 4]
		// 1.5f*y
		// y*y
		// x2*y
		fxch		st(2)
		// x2*y
		// y*y
		// 1.5f*y
		// ******** stall 1 clock ********
		fmulp		st(1), st
		// x2*y * y*y
		// 1.5f*y
		// ******** stall 2 clocks ********
		fsubp		st(1), st
		// y = 1.5f*y - (x2*y * y*y)

		ret
	}
}
#endif


// Returns the dot product of the X, Y and Z components of a and b.
// Without USE_ASM this is a single C expression.  With USE_ASM the body is
// x86 inline assembly that assumes no compiler prologue/epilogue: a is read
// at [esp + 4], b at [esp + 8], the three components are taken from byte
// offsets 0, 4 and 8 of each pointer and the sum is left in st(0).
// The `//` lines after each instruction list the x87 stack, top first.
__declspec(naked) float DotProduct(const float *a, const float *b)
{
#ifndef USE_ASM
	return  a[X]*b[X] + a[Y]*b[Y] + a[Z]*b[Z];
#else
	__asm
	{
		mov			ecx, [esp + 4]
		mov			edx, [esp + 8]

		fld			dword ptr [ecx]
		// a[X]
		fmul		dword ptr [edx]
		// a[X]*b[X] (2)
		fld			dword ptr [ecx + 4]
		// a[Y]
		// a[X]*b[X] (1)
		fmul		dword ptr [edx + 4]
		// a[Y]*b[Y] (2)
		// a[X]*b[X] (1)
		fld			dword ptr [ecx + 8]
		// a[Z]
		// a[Y]*b[Y] (1)
		// a[X]*b[X] (0)
		fmul		dword ptr [edx + 8]
		// a[Z]*b[Z] (2)
		// a[Y]*b[Y] (0)
		// a[X]*b[X] (0)
		fxch		st(2)
		// a[X]*b[X] (0)
		// a[Y]*b[Y] (0)
		// a[Z]*b[Z] (2)
		faddp		st(1), st
		// a[X]*b[X] + a[Y]*b[Y] (2)
		// a[Z]*b[Z] (1)
		// ******** stall 2 cycles ********
		faddp		st(1), st

		ret
	}
#endif
}

// Stores the cross product v1 x v2 (X, Y and Z components) in v.
//
// v may be the same array as v1 or v2.  The assembly path always was
// alias-safe because it performs all six multiplications before its first
// store; the C path now computes the three components into locals first so
// that it behaves the same way instead of reading back a component it has
// just overwritten.
//
// With USE_ASM the body assumes no compiler prologue/epilogue: v, v1 and v2
// are read at [esp + 4], [esp + 8] and [esp + 12].  The `//` lines after each
// instruction list the x87 stack, top first.
__declspec(naked) void CrossProduct(float *v, const float *v1, const float *v2)
{
#ifndef USE_ASM
	const float cx = v1[Y]*v2[Z] - v1[Z]*v2[Y];// = A - B
	const float cy = v1[Z]*v2[X] - v1[X]*v2[Z];// = C - D
	const float cz = v1[X]*v2[Y] - v1[Y]*v2[X];// = E - F

	// store only after every input has been read
	v[X] = cx;
	v[Y] = cy;
	v[Z] = cz;
#else
	__asm
	{
		mov			eax, [esp + 4]
		mov			ecx, [esp + 8]

		mov			edx, [esp + 12]

		fld			dword ptr [ecx + 4]
		// v1[Y]
		fmul		dword ptr [edx + 8]
		// v1[Y]*v2[Z] (2)
		fld			dword ptr [ecx + 8]
		// v1[Z]
		// v1[Y]*v2[Z] (1)
		fmul		dword ptr [edx + 4]
		// v1[Z]*v2[Y] (2)
		// v1[Y]*v2[Z] (0)
		fld			dword ptr [ecx + 8]
		// v1[Z]
		// v1[Z]*v2[Y] (1)
		// v1[Y]*v2[Z] (0)
		fmul		dword ptr [edx]
		// v1[Z]*v2[X] (2)
		// v1[Z]*v2[Y] (0)
		// v1[Y]*v2[Z] (0)
		fld			dword ptr [ecx]
		// v1[X]
		// v1[Z]*v2[X] (1)
		// v1[Z]*v2[Y] (0)
		// v1[Y]*v2[Z] (0)
		fmul		dword ptr [edx + 8]
		// v1[X]*v2[Z] (2)
		// v1[Z]*v2[X] (0)
		// v1[Z]*v2[Y] (0)
		// v1[Y]*v2[Z] (0)
		fld			dword ptr [ecx]
		// v1[X]
		// v1[X]*v2[Z] (1)
		// v1[Z]*v2[X] (0)
		// v1[Z]*v2[Y] (0)
		// v1[Y]*v2[Z] (0)
		fmul		dword ptr [edx + 4]
		// v1[X]*v2[Y] (2)
		// v1[X]*v2[Z] (0)
		// v1[Z]*v2[X] (0)
		// v1[Z]*v2[Y] (0)
		// v1[Y]*v2[Z] (0)
		fld			dword ptr [ecx + 4]
		// v1[Y]
		// v1[X]*v2[Y] (1)
		// v1[X]*v2[Z] (0)
		// v1[Z]*v2[X] (0)
		// v1[Z]*v2[Y] (0)
		// v1[Y]*v2[Z] (0)
		fmul		dword ptr [edx]
		// v1[Y]*v2[X] = F (2)
		// v1[X]*v2[Y] = E (0)
		// v1[X]*v2[Z] = D (0)
		// v1[Z]*v2[X] = C (0)
		// v1[Z]*v2[Y] = B (0)
		// v1[Y]*v2[Z] = A (0)
		fxch		st(5)
		// v1[Y]*v2[Z] = A (0)
		// v1[X]*v2[Y] = E (0)
		// v1[X]*v2[Z] = D (0)
		// v1[Z]*v2[X] = C (0)
		// v1[Z]*v2[Y] = B (0)
		// v1[Y]*v2[X] = F (2)
		fsubrp	st(4), st
		// v1[X]*v2[Y] = E (0)
		// v1[X]*v2[Z] = D (0)
		// v1[Z]*v2[X] = C (0)
		// A - B (2)
		// v1[Y]*v2[X] = F (1)
		fxch		st(2)
		// v1[Z]*v2[X] = C (0)
		// v1[X]*v2[Z] = D (0)
		// v1[X]*v2[Y] = E (0)
		// A - B (2)
		// v1[Y]*v2[X] = F (1)
		fsubrp	st(1), st
		// C - D (2)
		// v1[X]*v2[Y] = E (0)
		// A - B (1)
		// v1[Y]*v2[X] = F (0)
		fxch		st(1)
		// v1[X]*v2[Y] = E (0)
		// C - D (2)
		// A - B (1)
		// v1[Y]*v2[X] = F (0)
		fsubrp	st(3), st
		// C - D (1)
		// A - B (0)
		// E - F (2)
		fxch		st(1)
		// A - B (0)
		// C - D (1)
		// E - F (2)
		// ******** stall 1 cycle ********
		// (result must be ready one cycle in advance)
		fstp		dword ptr [eax]
		fstp		dword ptr [eax + 4]
		fstp		dword ptr [eax + 8]

		ret
	}
#endif
}

// v0, v1, and v2 must be structures with the x and y component
// as the first 2 32-bit members
// if the vertices v0, v1, and v2 are in counter clockwise order
// the return value will be positive, otherwise it'll be negative
// if they're collinear, the return value will be zero
//
// The value returned is
//   (v2.x - v1.x)*(v0.y - v1.y) - (v2.y - v1.y)*(v0.x - v1.x)
// With USE_ASM the body assumes no compiler prologue/epilogue: v0, v1 and v2
// are read at [esp + 4], [esp + 8] and [esp + 12], x and y are taken from
// byte offsets 0 and 4 of each pointer, and the result is left in st(0).
// The `//` lines after each instruction list the x87 stack, top first.
__declspec(naked) float NormZ(const float *v0, const float *v1, const float *v2)
{
#ifndef USE_ASM
	// (v2-v1) x (v0-v1)
	return (v2[X] - v1[X])*(v0[Y] - v1[Y]) - (v2[Y] - v1[Y])*(v0[X] - v1[X]);
#else
	__asm
	{
		mov			ecx, [esp + 8] // ecx = v1
		mov			edx, [esp + 12] // edx = v2

		mov			eax, [esp + 4] // eax = v0

		fld			dword ptr [edx]
		// v2.x
		fsub		dword ptr [ecx]
		// v2.x - v1.x (2)
		fld			dword ptr [eax + 4]
		// v0.y
		// v2.x - v1.x (1)
		fsub		dword ptr [ecx + 4]
		// v0.y - v1.y (2)
		// v2.x - v1.x (0)
		fld			dword ptr [edx + 4]
		// v2.y
		// v0.y - v1.y (1)
		// v2.x - v1.x (0)
		fsub		dword ptr [ecx + 4]
		// v2.y - v1.y (2)
		// v0.y - v1.y (0)
		// v2.x - v1.x (0)
		fld			dword ptr [eax]
		// v0.x
		// v2.y - v1.y (1)
		// v0.y - v1.y (0)
		// v2.x - v1.x (0)
		fsub		dword ptr [ecx]
		// v0.x - v1.x (2)
		// v2.y - v1.y (0)
		// v0.y - v1.y (0)
		// v2.x - v1.x (0)
		fxch		st(2)
		// v0.y - v1.y (0)
		// v2.y - v1.y (0)
		// v0.x - v1.x (2)
		// v2.x - v1.x (0)
		fmulp		st(3), st
		// v2.y - v1.y (0)
		// v0.x - v1.x (1)
		// (v0.y-v1.y)*(v2.x-v1.x) (3)
		// ******** STALL 1 clock ********
		// v2.y - v1.y (0)
		// v0.x - v1.x (0)
		// (v0.y-v1.y)*(v2.x-v1.x) (2)
		fmulp		st(1), st
		// (v2.y-v1.y)*(v0.x-v1.x) (2)
		// (v0.y-v1.y)*(v2.x-v1.x) (1)
		// ******** STALL 2 clocks ********
		fsubp		st(1), st

		ret
	}
#endif
}

// Length of the (X, Y, Z) part of v: fsqrt of the sum of the squared
// components.
float Magnitude(const float *v)
{
	const float vx = v[X];
	const float vy = v[Y];
	const float vz = v[Z];

	return fsqrt(SQR(vx) + SQR(vy) + SQR(vz));
}

// Distance between the points v0 and v1, using their X, Y and Z components:
// fsqrt of the sum of the squared per-axis differences.
float Distance(const float *v0, const float *v1)
{
	const float dx = v1[X]-v0[X];
	const float dy = v1[Y]-v0[Y];
	const float dz = v1[Z]-v0[Z];

	return fsqrt(SQR(dx) + SQR(dy) + SQR(dz));
}

// Scales the X, Y and Z components of v in place by 1/length.
// A vector whose squared length is not strictly positive (all components
// zero, or the sum is NaN) is left untouched: dividing by a zero length would
// otherwise overwrite every component with a non-finite value.
void Normalize(float *v)
{
	const float len_sqr = SQR(v[X]) + SQR(v[Y]) + SQR(v[Z]);
	float m;

	if (!(len_sqr > 0.0f))
	{
		return;
	}

	m = 1.0f/fsqrt(len_sqr);
	v[X] *= m;
	v[Y] *= m;
	v[Z] *= m;
}

// Like Normalize(), but multiplies by the approximate reciprocal square root
// returned by fsqrt_inv() instead of dividing by an exact square root.
void FastApproxNormalize(float *v)
{
	const float inv_len = fsqrt_inv(SQR(v[X]) + SQR(v[Y]) + SQR(v[Z]));

	v[X] = v[X] * inv_len;
	v[Y] = v[Y] * inv_len;
	v[Z] = v[Z] * inv_len;
}

// rotates point about the x axis by angle (in radians)
// X and W are copied through, Y and Z are mixed with the sine/cosine pair
// obtained from fsincos().  The two mixed inputs are read before anything is
// written, so res and point may be the same vector.
void RotateByXAxis(Vector res, Vector point, float angle)
{
	float s, c;
	float py, pz;

	fsincos(angle, &s, &c);

	py = point[Y];
	pz = point[Z];

	res[X] = point[X];
	res[Y] = (py * c) - (pz * s);
	res[Z] = (py * s) + (pz * c);
	res[W] = point[W];
}

// rotates point about the y axis by angle (in radians)
// Y and W are copied through, X and Z are mixed with the sine/cosine pair
// obtained from fsincos().  The two mixed inputs are read before anything is
// written, so res and point may be the same vector.
void RotateByYAxis(Vector res, Vector point, float angle)
{
	float s, c;
	float px, pz;

	fsincos(angle, &s, &c);

	px = point[X];
	pz = point[Z];

	res[X] = (px * c) - (pz * s);
	res[Y] = point[Y];
	res[Z] = (px * s) + (pz * c);
	res[W] = point[W];
}

// rotates point about the z axis by angle (in radians)
// Z and W are copied through, X and Y are mixed with the sine/cosine pair
// obtained from fsincos().  The two mixed inputs are read before anything is
// written, so res and point may be the same vector.
void RotateByZAxis(Vector res, Vector point, float angle)
{
	float s, c;
	float px, py;

	fsincos(angle, &s, &c);

	px = point[X];
	py = point[Y];

	res[X] = (px * c) - (py * s);
	res[Y] = (px * s) + (py * c);
	res[Z] = point[Z];
	res[W] = point[W];
}

// rotates point about axis by angle (in radians)
// using a quaternion formula
//
// With q = axis * sin(angle/2) and c = cos(angle/2) the result is
//   c*c*point + q*(point . q) + 2*c*(q x point) + q x (q x point)
// Only the X, Y and Z components of res are written.
void RotateByAxis(float *res, const float *point, float angle, const float *axis)
{
	float half_sin, half_cos;
	float c_sqr, two_c, q_dot_p;
	Vector q, q_x_p, q_x_q_x_p;

	fsincos(0.5f*angle, &half_sin, &half_cos);
	c_sqr = half_cos*half_cos;
	two_c = 2.0f*half_cos;

	// vector part of the rotation quaternion
	q[X] = axis[X]*half_sin;
	q[Y] = axis[Y]*half_sin;
	q[Z] = axis[Z]*half_sin;

	CrossProduct(q_x_p, q, point);
	CrossProduct(q_x_q_x_p, q, q_x_p);
	q_dot_p = DotProduct(point, q);

	res[X] = c_sqr*point[X] + q[X]*q_dot_p + two_c*q_x_p[X] + q_x_q_x_p[X];
	res[Y] = c_sqr*point[Y] + q[Y]*q_dot_p + two_c*q_x_p[Y] + q_x_q_x_p[Y];
	res[Z] = c_sqr*point[Z] + q[Z]*q_dot_p + two_c*q_x_p[Z] + q_x_q_x_p[Z];
}

// Writes the transpose of the 4x4 matrix m to res.  The result is assembled
// in a temporary and copied out with MatrixCopy(), so res and m may be the
// same matrix.
void TransposeMatrix(Matrix res, Matrix m)
{
	Matrix flipped;
	int row, col;

	for (row = 0; row < 4; row++)
	{
		for (col = 0; col < 4; col++)
		{
			flipped[row][col] = m[col][row];
		}
	}

	MatrixCopy(res, flipped);
}

// res = m1 * m2 for full 4x4 matrices (row of m1 times column of m2).
// The product is built in a temporary and copied out with MatrixCopy(), so
// res may be the same matrix as m1 or m2.
void MatMultMat4x4(Matrix res, Matrix m1, Matrix m2)
{
	Matrix prod;
	int row, col;

	for (row = 0; row < 4; row++)
	{
		for (col = 0; col < 4; col++)
		{
			prod[row][col] = m1[row][0]*m2[0][col] + m1[row][1]*m2[1][col]
			               + m1[row][2]*m2[2][col] + m1[row][3]*m2[3][col];
		}
	}

	MatrixCopy(res, prod);
}

// res = m1 * m2 where only the top three rows of each operand are read: the
// fourth row of both is treated as (0, 0, 0, 1) and the fourth row of the
// result is set to exactly that.  The product is built in a temporary and
// copied out with MatrixCopy(), so res may be the same matrix as m1 or m2.
void MatMultMat3x4(Matrix res, Matrix m1, Matrix m2)
{
	Matrix prod;
	int row, col;

	for (row = 0; row < 3; row++)
	{
		// 3x3 part
		for (col = 0; col < 3; col++)
		{
			prod[row][col] = m1[row][0]*m2[0][col] + m1[row][1]*m2[1][col] + m1[row][2]*m2[2][col];
		}
		// last column additionally picks up m1's own last-column entry
		prod[row][3] = m1[row][0]*m2[0][3] + m1[row][1]*m2[1][3] + m1[row][2]*m2[2][3] + m1[row][3];
	}

	prod[3][0] = 0.0f;
	prod[3][1] = 0.0f;
	prod[3][2] = 0.0f;
	prod[3][3] = 1.0f;

	MatrixCopy(res, prod);
}

// Multiplies the first three components of v by the upper-left 3x3 part of m
// and stores only the first two result components (res[0] and res[1]).
//
// res may be the same vector as v.  The assembly path always was alias-safe
// because it finishes every load before its first store; the C path now
// evaluates both rows into locals before storing so that res[1] is not
// computed from an already overwritten v[0].
//
// With USE_ASM the body assumes no compiler prologue/epilogue: res, m and v
// are read at [esp + 4], [esp + 8] and [esp + 12].  The `//` lines after each
// instruction list the x87 stack, top first.
__declspec(naked) void MatMultVec3x3_2(Vector res, Matrix m, Vector v)
{
#ifndef USE_ASM
	const float r0 = m[0][0]*v[0] + m[0][1]*v[1] + m[0][2]*v[2];
	const float r1 = m[1][0]*v[0] + m[1][1]*v[1] + m[1][2]*v[2];

	// store only after every input has been read
	res[0] = r0;
	res[1] = r1;
#else
	__asm
	{
		mov			eax, [esp + 4] // eax = res
		mov			ecx, [esp + 8] // ecx = m

		mov			edx, [esp + 12] // edx = v

		fld			dword ptr [ecx]
		// m00
		fmul		dword ptr [edx]
		// m00*v0
		fld			dword ptr [ecx + 4]
		// m01
		// m00*v0
		fmul		dword ptr [edx + 4]
		// m01*v1
		// m00*v0
		fld			dword ptr [ecx + 8]
		// m02
		// m01*v1
		// m00*v0
		fmul		dword ptr [edx + 8]
		// m02*v2
		// m01*v1
		// m00*v0
		fld			dword ptr [ecx + 16]
		// m10
		// m02*v2
		// m01*v1
		// m00*v0
		fmul		dword ptr [edx]
		// m10*v0
		// m02*v2
		// m01*v1
		// m00*v0
		fld			dword ptr [ecx + 20]
		// m11
		// m10*v0
		// m02*v2
		// m01*v1
		// m00*v0
		fmul		dword ptr [edx + 4]
		// m11*v1
		// m10*v0
		// m02*v2
		// m01*v1
		// m00*v0
		fld			dword ptr [ecx + 24]
		// m12
		// m11*v1
		// m10*v0
		// m02*v2
		// m01*v1
		// m00*v0
		fmul		dword ptr [edx + 8]
		// m12*v2
		// m11*v1
		// m10*v0
		// m02*v2
		// m01*v1
		// m00*v0
		fxch		st(5)
		// m00*v0
		// m11*v1
		// m10*v0
		// m02*v2
		// m01*v1
		// m12*v2
		faddp		st(4), st
		// m11*v1
		// m10*v0
		// m02*v2
		// m00*v0 + m01*v1
		// m12*v2
		faddp		st(1), st
		// m10*v0 + m11*v1
		// m02*v2
		// m00*v0 + m01*v1
		// m12*v2
		fxch		st(1)
		// m02*v2
		// m10*v0 + m11*v1
		// m00*v0 + m01*v1
		// m12*v2
		faddp		st(2), st
		// m10*v0 + m11*v1
		// m00*v0 + m01*v1 + m02*v2
		// m12*v2
		faddp		st(2), st
		// m00*v0 + m01*v1 + m02*v2
		// m10*v0 + m11*v1 + m12*v2
		fstp		dword ptr [eax]
		fstp		dword ptr [eax + 4]
		ret
	}
#endif
}

// assumes v[W] == 1.0f
//
// Transforms the first three components of v by the top three rows of m,
// adding the fourth column of m as if v[3] were 1, and sets res[3] to 1.0f.
//
// res may be the same vector as v.  The assembly path always was alias-safe
// because its last load from v precedes its first store to res; the C path
// now evaluates all three rows into locals before storing so that later rows
// are not computed from already overwritten components.
//
// With USE_ASM the body assumes no compiler prologue/epilogue: res, m and v
// are read at [esp + 4], [esp + 8] and [esp + 12].  The `//` lines after each
// instruction list the x87 stack, top first.
__declspec(naked) void MatMultVec3x4_3(Vector res, Matrix m, Vector v)
{
#ifndef USE_ASM
	const float r0 = m[0][0]*v[0] + m[0][1]*v[1] + m[0][2]*v[2] + m[0][3];
	const float r1 = m[1][0]*v[0] + m[1][1]*v[1] + m[1][2]*v[2] + m[1][3];
	const float r2 = m[2][0]*v[0] + m[2][1]*v[1] + m[2][2]*v[2] + m[2][3];

	// store only after every input has been read
	res[0] = r0;
	res[1] = r1;
	res[2] = r2;
	res[3] = 1.0f;
#else
	__asm // 38 cycles
	{
		mov			eax, [esp + 4]
		mov			ecx, [esp + 8]

		mov			edx, [esp + 12]

		fld			dword ptr [ecx]
		// m00
		fmul		dword ptr [edx]
		// m00*v0 (2)
		fld			dword ptr [ecx + 4]
		// m01
		// m00*v0 (1)
		fmul		dword ptr [edx + 4]
		// m01*v1 (2)
		// m00*v0 (0)
		fxch		st(1)
		// m00*v0 (0)
		// m01*v1 (2)
		fadd		dword ptr [ecx + 12]
		// m00*v0 + m03 (2)
		// m01*v1 (1)
		fld			dword ptr [ecx + 8]
		// m02
		// m00*v0 + m03 (1)
		// m01*v1 (0)
		fmul		dword ptr [edx + 8]
		// m02*v2 (2)
		// m00*v0 + m03 (0)
		// m01*v1 (0)
		fxch		st(2)
		// m01*v1 (0)
		// m00*v0 + m03 (0)
		// m02*v2 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 + m03 (2)
		// m02*v2 (1)
		fld			dword ptr [ecx + 16]
		// m10
		// m00*v0 + m01*v1 + m03 (1)
		// m02*v2 (0)
		fmul		dword ptr [edx]
		// m10*v0 (2)
		// m00*v0 + m01*v1 + m03 (0)
		// m02*v2 (0)
		fxch		st(2)
		// m02*v2 (0)
		// m00*v0 + m01*v1 + m03 (0)
		// m10*v0 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 + m02*v2 + m03 (2)
		// m10*v0 (1)
		fld			dword ptr [ecx + 16 + 4]
		// m11
		// m00*v0 + m01*v1 + m02*v2 + m03 (1)
		// m10*v0 (0)
		fmul		dword ptr [edx + 4]
		// m11*v1 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m10*v0 (0)
		fxch		st(2)
		// m10*v0 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m11*v1 (2)
		fadd		dword ptr [ecx + 16 + 12]
		// m10*v0 + m13 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m11*v1 (1)
		fld			dword ptr [ecx + 16 + 8]
		// m12
		// m10*v0 + m13 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m11*v1 (0)
		fmul		dword ptr [edx + 8]
		// m12*v2 (2)
		// m10*v0 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m11*v1 (0)
		fxch		st(3)
		// m11*v1 (0)
		// m10*v0 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m12*v2 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 + m13 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m12*v2 (1)
		fld			dword ptr [ecx + 32]
		// m20
		// m10*v0 + m11*v1 + m13 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m12*v2 (0)
		fmul		dword ptr [edx]
		// m20*v0 (2)
		// m10*v0 + m11*v1 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m12*v2 (0)
		fxch		st(3)
		// m12*v2 (0)
		// m10*v0 + m11*v1 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m20*v0 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 + m12*v2 + m13 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m20*v0 (1)
		fld			dword ptr [ecx + 32 + 4]
		// m21
		// m10*v0 + m11*v1 + m12*v2 + m13 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m20*v0 (0)
		fmul		dword ptr [edx + 4]
		// m21*v1 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m20*v0 (0)
		fxch		st(3)
		// m20*v0 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m21*v1 (2)
		fadd		dword ptr [ecx + 32 + 12]
		// m20*v0 + m23 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m21*v1 (1)
		fld			dword ptr [ecx + 32 + 8]
		// m22
		// m20*v0 + m23 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m21*v1 (0)
		fmul		dword ptr [edx + 8]
		// m22*v2 (2)
		// m20*v0 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m21*v1 (0)
		fxch		st(4)
		// m21*v1 (0)
		// m20*v0 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m22*v2 (2)
		faddp		st(1), st
		// m20*v0 + m21*v1 + m23 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m22*v2 (1)
		fxch		st(1)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m20*v0 + m21*v1 + m23 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m22*v2 (1)
		fstp		dword ptr [eax + 4]
		// m20*v0 + m21*v1 + m23 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m22*v2 (0)
		faddp		st(2), st
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m20*v0 + m21*v1 + m22*v2 + m23 (2)
		fstp		dword ptr [eax]
		// m20*v0 + m21*v1 + m22*v2 + m23 (0)
		mov			dword ptr [eax + 12], 0x3f800000
		fstp		dword ptr [eax + 8]

		ret
	}
#endif
}

// Transforms all four components of v by the top three rows of m (the fourth
// column of m is multiplied by v[3]) and sets res[3] to 1.0f.
//
// res may be the same vector as v.  The assembly path always was alias-safe
// because its last load from v precedes its first store to res; the C path
// now evaluates all three rows into locals before storing so that later rows
// are not computed from already overwritten components.
//
// With USE_ASM the body assumes no compiler prologue/epilogue: res, m and v
// are read at [esp + 4], [esp + 8] and [esp + 12].  The `//` lines after each
// instruction list the x87 stack, top first.
__declspec(naked) void MatMultVec3x4_4(Vector res, Matrix m, Vector v)
{
#ifndef USE_ASM
	const float r0 = m[0][0]*v[0] + m[0][1]*v[1] + m[0][2]*v[2] + m[0][3]*v[3];
	const float r1 = m[1][0]*v[0] + m[1][1]*v[1] + m[1][2]*v[2] + m[1][3]*v[3];
	const float r2 = m[2][0]*v[0] + m[2][1]*v[1] + m[2][2]*v[2] + m[2][3]*v[3];

	// store only after every input has been read
	res[0] = r0;
	res[1] = r1;
	res[2] = r2;
	res[3] = 1.0f;
#else
	__asm // 44 cycles
	{
		mov			eax, [esp + 4]
		mov			ecx, [esp + 8]

		mov			edx, [esp + 12]

		fld			dword ptr [ecx]
		// m00
		fmul		dword ptr [edx]
		// m00*v0 (2)
		fld			dword ptr [ecx + 4]
		// m01
		// m00*v0 (1)
		fmul		dword ptr [edx + 4]
		// m01*v1 (2)
		// m00*v0 (0)
		fld			dword ptr [ecx + 8]
		// m02
		// m01*v1 (1)
		// m00*v0 (0)
		fmul		dword ptr [edx + 8]
		// m02*v2 (2)
		// m01*v1 (0)
		// m00*v0 (0)
		fxch		st(2)
		// m00*v0 (0)
		// m01*v1 (0)
		// m02*v2 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 (2)
		// m02*v2 (1)
		fld			dword ptr [ecx + 12]
		// m03
		// m00*v0 + m01*v1 (1)
		// m02*v2 (0)
		fmul		dword ptr [edx + 12]
		// m03*v3 (2)
		// m00*v0 + m01*v1 (0)
		// m02*v2 (0)
		fxch		st(2)
		// m02*v2 (0)
		// m00*v0 + m01*v1 (0)
		// m03*v3 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 + m02*v2 (2)
		// m03*v3 (1)
		fld			dword ptr [ecx + 16]
		// m10
		// m00*v0 + m01*v1 + m02*v2 (1)
		// m03*v3 (0)
		fmul		dword ptr [edx]
		// m10*v0 (2)
		// m00*v0 + m01*v1 + m02*v2 (0)
		// m03*v3 (0)
		fxch		st(2)
		// m03*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 (0)
		// m10*v0 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (2)
		// m10*v0 (1)
		fld			dword ptr [ecx + 16 + 4]
		// m11
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (1)
		// m10*v0 (0)
		fmul		dword ptr [edx + 4]
		// m11*v1 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m10*v0 (0)
		fld			dword ptr [ecx + 16 + 8]
		// m12
		// m11*v1 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m10*v0 (0)
		fmul		dword ptr [edx + 8]
		// m12*v2 (2)
		// m11*v1 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m10*v0 (0)
		fxch		st(3)
		// m10*v0 (0)
		// m11*v1 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m12*v2 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m12*v2 (1)
		fld			dword ptr [ecx + 16 + 12]
		// m13
		// m10*v0 + m11*v1 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m12*v2 (0)
		fmul		dword ptr [edx + 12]
		// m13*v3 (2)
		// m10*v0 + m11*v1 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m12*v2 (0)
		fxch		st(3)
		// m12*v2 (0)
		// m10*v0 + m11*v1 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m13*v3 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 + m12*v2 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m13*v3 (1)
		fld			dword ptr [ecx + 32]
		// m20
		// m10*v0 + m11*v1 + m12*v2 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m13*v3 (0)
		fmul		dword ptr [edx]
		// m20*v0 (2)
		// m10*v0 + m11*v1 + m12*v2 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m13*v3 (0)
		fxch		st(3)
		// m13*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (1)
		fld			dword ptr [ecx + 32 + 4]
		// m21
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (0)
		fmul		dword ptr [edx + 4]
		// m21*v1 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (0)
		fld			dword ptr [ecx + 32 + 8]
		// m22
		// m21*v1 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (0)
		fmul		dword ptr [edx + 8]
		// m22*v2 (2)
		// m21*v1 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (0)
		fxch		st(4)
		// m20*v0 (0)
		// m21*v1 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m22*v2 (2)
		faddp		st(1), st
		// m20*v0 + m21*v1 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m22*v2 (1)
		fld			dword ptr [ecx + 32 + 12]
		// m23
		// m20*v0 + m21*v1 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m22*v2 (0)
		fmul		dword ptr [edx + 12]
		// m23*v3 (2)
		// m20*v0 + m21*v1 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m22*v2 (0)
		fxch		st(4)
		// m22*v2 (0)
		// m20*v0 + m21*v1 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m23*v3 (2)
		faddp		st(1), st
		// m20*v0 + m21*v1 + m22*v2 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m23*v3 (1)
		fxch		st(1)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m20*v0 + m21*v1 + m22*v2 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m23*v3 (1)
		fstp		dword ptr [eax + 4]
		// m20*v0 + m21*v1 + m22*v2 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m23*v3 (0)
		faddp		st(2), st
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (2)
		fstp		dword ptr [eax]
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		mov			dword ptr [eax + 12], 0x3f800000
		fstp		dword ptr [eax + 8]

		ret
	}
#endif
}

// res = m * v for a point: assumes v[W] == 1.0f, so v[3] is never read and
// the fourth column of m is added directly to each row's dot product.
// All four components of res are written (res[3] is the transformed w).
// res may be the same array as v: both code paths read every input before
// the first store to res.
__declspec(naked) void MatMultVec4x4_3(Vector res, Matrix m, Vector v)
{
#ifndef USE_ASM
	// evaluate every row into a temporary first; storing res[0] early would
	// corrupt v[0] for the remaining rows when res and v alias
	const float r0 = m[0][0]*v[0] + m[0][1]*v[1] + m[0][2]*v[2] + m[0][3];
	const float r1 = m[1][0]*v[0] + m[1][1]*v[1] + m[1][2]*v[2] + m[1][3];
	const float r2 = m[2][0]*v[0] + m[2][1]*v[1] + m[2][2]*v[2] + m[2][3];
	const float r3 = m[3][0]*v[0] + m[3][1]*v[1] + m[3][2]*v[2] + m[3][3];

	res[0] = r0;
	res[1] = r1;
	res[2] = r2;
	res[3] = r3;
#else
	// the comments after each instruction show the FPU stack, top first
	__asm // 48 cycles
	{
		mov			eax, [esp + 4]
		mov			ecx, [esp + 8]

		mov			edx, [esp + 12]

		fld			dword ptr [ecx]
		// m00
		fmul		dword ptr [edx]
		// m00*v0 (2)
		fld			dword ptr [ecx + 4]
		// m01
		// m00*v0 (1)
		fmul		dword ptr [edx + 4]
		// m01*v1 (2)
		// m00*v0 (0)
		fxch		st(1)
		// m00*v0 (0)
		// m01*v1 (2)
		fadd		dword ptr [ecx + 12]
		// m00*v0 + m03 (2)
		// m01*v1 (1)
		fld			dword ptr [ecx + 8]
		// m02
		// m00*v0 + m03 (1)
		// m01*v1 (0)
		fmul		dword ptr [edx + 8]
		// m02*v2 (2)
		// m00*v0 + m03 (0)
		// m01*v1 (0)
		fxch		st(2)
		// m01*v1 (0)
		// m00*v0 + m03 (0)
		// m02*v2 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 + m03 (2)
		// m02*v2 (1)
		fld			dword ptr [ecx + 16]
		// m10
		// m00*v0 + m01*v1 + m03 (1)
		// m02*v2 (0)
		fmul		dword ptr [edx]
		// m10*v0 (2)
		// m00*v0 + m01*v1 + m03 (0)
		// m02*v2 (0)
		fxch		st(2)
		// m02*v2 (0)
		// m00*v0 + m01*v1 + m03 (0)
		// m10*v0 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 + m02*v2 + m03 (2)
		// m10*v0 (1)
		fld			dword ptr [ecx + 16 + 4]
		// m11
		// m00*v0 + m01*v1 + m02*v2 + m03 (1)
		// m10*v0 (0)
		fmul		dword ptr [edx + 4]
		// m11*v1 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m10*v0 (0)
		fxch		st(2)
		// m10*v0 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m11*v1 (2)
		fadd		dword ptr [ecx + 16 + 12]
		// m10*v0 + m13 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m11*v1 (1)
		fld			dword ptr [ecx + 16 + 8]
		// m12
		// m10*v0 + m13 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m11*v1 (0)
		fmul		dword ptr [edx + 8]
		// m12*v2 (2)
		// m10*v0 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m11*v1 (0)
		fxch		st(3)
		// m11*v1 (0)
		// m10*v0 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m12*v2 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 + m13 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m12*v2 (1)
		fld			dword ptr [ecx + 32]
		// m20
		// m10*v0 + m11*v1 + m13 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m12*v2 (0)
		fmul		dword ptr [edx]
		// m20*v0 (2)
		// m10*v0 + m11*v1 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m12*v2 (0)
		fxch		st(3)
		// m12*v2 (0)
		// m10*v0 + m11*v1 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m20*v0 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 + m12*v2 + m13 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m20*v0 (1)
		fld			dword ptr [ecx + 32 + 4]
		// m21
		// m10*v0 + m11*v1 + m12*v2 + m13 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m20*v0 (0)
		fmul		dword ptr [edx + 4]
		// m21*v1 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m20*v0 (0)
		fxch		st(3)
		// m20*v0 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m21*v1 (2)
		fadd		dword ptr [ecx + 32 + 12]
		// m20*v0 + m23 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m21*v1 (1)
		fld			dword ptr [ecx + 32 + 8]
		// m22
		// m20*v0 + m23 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m21*v1 (0)
		fmul		dword ptr [edx + 8]
		// m22*v2 (2)
		// m20*v0 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m21*v1 (0)
		fxch		st(4)
		// m21*v1 (0)
		// m20*v0 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m22*v2 (2)
		faddp		st(1), st
		// m20*v0 + m21*v1 + m23 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m22*v2 (1)
		fld			dword ptr [ecx + 48]
		// m30
		// m20*v0 + m21*v1 + m23 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m22*v2 (0)
		fmul		dword ptr [edx]
		// m30*v0 (2)
		// m20*v0 + m21*v1 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m22*v2 (0)
		fxch		st(4)
		// m22*v2 (0)
		// m20*v0 + m21*v1 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m30*v0 (2)
		faddp		st(1), st
		// m20*v0 + m21*v1 + m22*v2 + m23 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m30*v0 (1)
		fld			dword ptr [ecx + 48 + 4]
		// m31
		// m20*v0 + m21*v1 + m22*v2 + m23 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m30*v0 (0)
		fmul		dword ptr [edx + 4]
		// m31*v1 (2)
		// m20*v0 + m21*v1 + m22*v2 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m30*v0 (0)
		fxch		st(4)
		// m30*v0 (0)
		// m20*v0 + m21*v1 + m22*v2 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m31*v1 (2)
		fadd		dword ptr [ecx + 48 + 12]
		// m30*v0 + m33 (2)
		// m20*v0 + m21*v1 + m22*v2 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m31*v1 (1)
		fld			dword ptr [ecx + 48 + 8]
		// m32
		// m30*v0 + m33 (1)
		// m20*v0 + m21*v1 + m22*v2 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m31*v1 (0)
		fmul		dword ptr [edx + 8]
		// m32*v2 (2)
		// m30*v0 + m33 (0)
		// m20*v0 + m21*v1 + m22*v2 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m31*v1 (0)
		fxch		st(5)
		// m31*v1 (0)
		// m30*v0 + m33 (0)
		// m20*v0 + m21*v1 + m22*v2 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m32*v2 (2)
		faddp		st(1), st
		// m30*v0 + m31*v1 + m33 (2)
		// m20*v0 + m21*v1 + m22*v2 + m23 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m32*v2 (1)
		fxch		st(1)
		// m20*v0 + m21*v1 + m22*v2 + m23 (0)
		// m30*v0 + m31*v1 + m33 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m32*v2 (1)
		fstp		dword ptr [eax + 8]
		// m30*v0 + m31*v1 + m33 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m32*v2 (0)
		faddp		st(3), st
		// m10*v0 + m11*v1 + m12*v2 + m13 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m30*v0 + m31*v1 + m32*v2 + m33 (2)
		fstp		dword ptr [eax + 4]
		// m00*v0 + m01*v1 + m02*v2 + m03 (0)
		// m30*v0 + m31*v1 + m32*v2 + m33 (0)
		fstp		dword ptr [eax]
		// m30*v0 + m31*v1 + m32*v2 + m33 (0)
		fstp		dword ptr [eax + 12]

		ret
	}
#endif
}

// res = m * v for a full homogeneous vector: every row of m is dotted with
// all four components of v, and all four components of res are written.
// res may be the same array as v: both code paths read every input before
// the first store to res.
__declspec(naked) void MatMultVec4x4_4(Vector res, Matrix m, Vector v)
{
#ifndef USE_ASM
	// evaluate every row into a temporary first; storing res[0] early would
	// corrupt v[0] for the remaining rows when res and v alias
	const float r0 = m[0][0]*v[0] + m[0][1]*v[1] + m[0][2]*v[2] + m[0][3]*v[3];
	const float r1 = m[1][0]*v[0] + m[1][1]*v[1] + m[1][2]*v[2] + m[1][3]*v[3];
	const float r2 = m[2][0]*v[0] + m[2][1]*v[1] + m[2][2]*v[2] + m[2][3]*v[3];
	const float r3 = m[3][0]*v[0] + m[3][1]*v[1] + m[3][2]*v[2] + m[3][3]*v[3];

	res[0] = r0;
	res[1] = r1;
	res[2] = r2;
	res[3] = r3;
#else
	// the comments after each instruction show the FPU stack, top first
	__asm // 56 cycles
	{
		mov			eax, [esp + 4]  // eax = res
		mov			ecx, [esp + 8]  // ecx = m

		mov			edx, [esp + 12] // edx = v

		fld			dword ptr [ecx]
		// m00
		fmul		dword ptr [edx]
		// m00*v0 (2)
		fld			dword ptr [ecx + 4]
		// m01
		// m00*v0 (1)
		fmul		dword ptr [edx + 4]
		// m01*v1 (2)
		// m00*v0 (0)
		fld			dword ptr [ecx + 8]
		// m02
		// m01*v1 (1)
		// m00*v0 (0)
		fmul		dword ptr [edx + 8]
		// m02*v2 (2)
		// m01*v1 (0)
		// m00*v0 (0)
		fxch		st(2)
		// m00*v0 (0)
		// m01*v1 (0)
		// m02*v2 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 (2)
		// m02*v2 (1)
		fld			dword ptr [ecx + 12]
		// m03
		// m00*v0 + m01*v1 (1)
		// m02*v2 (0)
		fmul		dword ptr [edx + 12]
		// m03*v3 (2)
		// m00*v0 + m01*v1 (0)
		// m02*v2 (0)
		fxch		st(2)
		// m02*v2 (0)
		// m00*v0 + m01*v1 (0)
		// m03*v3 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 + m02*v2 (2)
		// m03*v3 (1)
		fld			dword ptr [ecx + 16]
		// m10
		// m00*v0 + m01*v1 + m02*v2 (1)
		// m03*v3 (0)
		fmul		dword ptr [edx]
		// m10*v0 (2)
		// m00*v0 + m01*v1 + m02*v2 (0)
		// m03*v3 (0)
		fxch		st(2)
		// m03*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 (0)
		// m10*v0 (2)
		faddp		st(1), st
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (2)
		// m10*v0 (1)
		fld			dword ptr [ecx + 16 + 4]
		// m11
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (1)
		// m10*v0 (0)
		fmul		dword ptr [edx + 4]
		// m11*v1 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m10*v0 (0)
		fld			dword ptr [ecx + 16 + 8]
		// m12
		// m11*v1 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m10*v0 (0)
		fmul		dword ptr [edx + 8]
		// m12*v2 (2)
		// m11*v1 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m10*v0 (0)
		fxch		st(3)
		// m10*v0 (0)
		// m11*v1 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m12*v2 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m12*v2 (1)
		fld			dword ptr [ecx + 16 + 12]
		// m13
		// m10*v0 + m11*v1 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m12*v2 (0)
		fmul		dword ptr [edx + 12]
		// m13*v3 (2)
		// m10*v0 + m11*v1 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m12*v2 (0)
		fxch		st(3)
		// m12*v2 (0)
		// m10*v0 + m11*v1 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m13*v3 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 + m12*v2 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m13*v3 (1)
		fld			dword ptr [ecx + 32]
		// m20
		// m10*v0 + m11*v1 + m12*v2 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m13*v3 (0)
		fmul		dword ptr [edx]
		// m20*v0 (2)
		// m10*v0 + m11*v1 + m12*v2 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m13*v3 (0)
		fxch		st(3)
		// m13*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (2)
		faddp		st(1), st
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (2)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (1)
		fld			dword ptr [ecx + 32 + 4]
		// m21
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (1)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (0)
		fmul		dword ptr [edx + 4]
		// m21*v1 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (0)
		fld			dword ptr [ecx + 32 + 8]
		// m22
		// m21*v1 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (0)
		fmul		dword ptr [edx + 8]
		// m22*v2 (2)
		// m21*v1 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m20*v0 (0)
		fxch		st(4)
		// m20*v0 (0)
		// m21*v1 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m22*v2 (2)
		faddp		st(1), st
		// m20*v0 + m21*v1 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m22*v2 (1)
		fld			dword ptr [ecx + 32 + 12]
		// m23
		// m20*v0 + m21*v1 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m22*v2 (0)
		fmul		dword ptr [edx + 12]
		// m23*v3 (2)
		// m20*v0 + m21*v1 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m22*v2 (0)
		fxch		st(4)
		// m22*v2 (0)
		// m20*v0 + m21*v1 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m23*v3 (2)
		faddp		st(1), st
		// m20*v0 + m21*v1 + m22*v2 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m23*v3 (1)
		fld			dword ptr [ecx + 48]
		// m30
		// m20*v0 + m21*v1 + m22*v2 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m23*v3 (0)
		fmul		dword ptr [edx]
		// m30*v0 (2)
		// m20*v0 + m21*v1 + m22*v2 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m23*v3 (0)
		fxch		st(4)
		// m23*v3 (0)
		// m20*v0 + m21*v1 + m22*v2 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m30*v0 (2)
		faddp		st(1), st
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m30*v0 (1)
		fld			dword ptr [ecx + 48 + 4]
		// m31
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (1)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m30*v0 (0)
		fmul		dword ptr [edx + 4]
		// m31*v1 (2)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m30*v0 (0)
		fld			dword ptr [ecx + 48 + 8]
		// m32
		// m31*v1 (1)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m30*v0 (0)
		fmul		dword ptr [edx + 8]
		// m32*v2 (2)
		// m31*v1 (0)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m30*v0 (0)
		fxch		st(5)
		// m30*v0 (0)
		// m31*v1 (0)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m32*v2 (2)
		faddp		st(1), st
		// m30*v0 + m31*v1 (2)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m32*v2 (1)
		fld			dword ptr [ecx + 48 + 12]
		// m33
		// m30*v0 + m31*v1 (1)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m32*v2 (0)
		fmul		dword ptr [edx + 12]
		// m33*v3 (2)
		// m30*v0 + m31*v1 (0)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m32*v2 (0)
		fxch		st(5)
		// m32*v2 (0)
		// m30*v0 + m31*v1 (0)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m33*v3 (2)
		faddp		st(1), st
		// m30*v0 + m31*v1 + m32*v2 (2)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m33*v3 (1)
		fxch		st(1)
		// m20*v0 + m21*v1 + m22*v2 + m23*v3 (0)
		// m30*v0 + m31*v1 + m32*v2 (2)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m33*v3 (1)
		fstp		dword ptr [eax + 8]
		// m30*v0 + m31*v1 + m32*v2 (0)
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m33*v3 (0)
		faddp		st(3), st
		// m10*v0 + m11*v1 + m12*v2 + m13*v3 (0)
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m30*v0 + m31*v1 + m32*v2 + m33*v3 (2)
		fstp		dword ptr [eax + 4]
		// m00*v0 + m01*v1 + m02*v2 + m03*v3 (0)
		// m30*v0 + m31*v1 + m32*v2 + m33*v3 (0)
		fstp		dword ptr [eax]
		// m30*v0 + m31*v1 + m32*v2 + m33*v3 (0)
		fstp		dword ptr [eax + 12]

		ret
	}
#endif
}

// Fills m with the 4x4 identity matrix: 1.0f on the main diagonal and 0.0f
// everywhere else.
void IdentityMat(Matrix m)
{
	int row, col;

	for (row = 0; row < 4; row++)
	{
		for (col = 0; col < 4; col++)
		{
			m[row][col] = (row == col) ? 1.0f : 0.0f;
		}
	}
}

// Fills m with a scaling matrix: (sx, sy, sz, 1.0f) on the main diagonal and
// 0.0f everywhere else.
void ScaleMat(Matrix m, float sx, float sy, float sz)
{
	float diagonal[4];
	int row, col;

	diagonal[0] = sx;
	diagonal[1] = sy;
	diagonal[2] = sz;
	diagonal[3] = 1.0f;

	for (row = 0; row < 4; row++)
	{
		for (col = 0; col < 4; col++)
		{
			m[row][col] = (row == col) ? diagonal[row] : 0.0f;
		}
	}
}

// Fills m with a translation matrix: identity in the upper-left 3x3, the
// offsets (dx, dy, dz) in the fourth column, and (0, 0, 0, 1) as bottom row.
void TranslateMat(Matrix m, float dx, float dy, float dz)
{
	float offset[3];
	int row, col;

	offset[0] = dx;
	offset[1] = dy;
	offset[2] = dz;

	for (row = 0; row < 3; row++)
	{
		for (col = 0; col < 3; col++)
		{
			m[row][col] = (row == col) ? 1.0f : 0.0f;
		}
		m[row][3] = offset[row];
	}

	m[3][0] = 0.0f;
	m[3][1] = 0.0f;
	m[3][2] = 0.0f;
	m[3][3] = 1.0f;
}

// Fills m with a rotation about the X axis by `angle`.
// The sine and cosine are obtained from fsincos(); presumably the angle is
// in radians, as documented for RotateMat -- confirm against fsincos.
void RotateXMat(Matrix m, float angle)
{
	float s, c;

	fsincos(angle, &s, &c);

	// X is left unchanged
	m[0][0] = 1.0f;
	m[0][1] = 0.0f;
	m[0][2] = 0.0f;
	m[0][3] = 0.0f;

	// Y and Z are rotated into each other
	m[1][0] = 0.0f;
	m[1][1] = c;
	m[1][2] = -s;
	m[1][3] = 0.0f;

	m[2][0] = 0.0f;
	m[2][1] = s;
	m[2][2] = c;
	m[2][3] = 0.0f;

	// no translation, no projection
	m[3][0] = 0.0f;
	m[3][1] = 0.0f;
	m[3][2] = 0.0f;
	m[3][3] = 1.0f;
}

// Fills m with a rotation about the Y axis by `angle`.
// The sine and cosine are obtained from fsincos(); presumably the angle is
// in radians, as documented for RotateMat -- confirm against fsincos.
void RotateYMat(Matrix m, float angle)
{
	float s, c;

	fsincos(angle, &s, &c);

	// X and Z are rotated into each other
	m[0][0] = c;
	m[0][1] = 0.0f;
	m[0][2] = s;
	m[0][3] = 0.0f;

	// Y is left unchanged
	m[1][0] = 0.0f;
	m[1][1] = 1.0f;
	m[1][2] = 0.0f;
	m[1][3] = 0.0f;

	m[2][0] = -s;
	m[2][1] = 0.0f;
	m[2][2] = c;
	m[2][3] = 0.0f;

	// no translation, no projection
	m[3][0] = 0.0f;
	m[3][1] = 0.0f;
	m[3][2] = 0.0f;
	m[3][3] = 1.0f;
}

// Fills m with a rotation about the Z axis by `angle`.
// The sine and cosine are obtained from fsincos(); presumably the angle is
// in radians, as documented for RotateMat -- confirm against fsincos.
void RotateZMat(Matrix m, float angle)
{
	float s, c;

	fsincos(angle, &s, &c);

	// X and Y are rotated into each other
	m[0][0] = c;
	m[0][1] = -s;
	m[0][2] = 0.0f;
	m[0][3] = 0.0f;

	m[1][0] = s;
	m[1][1] = c;
	m[1][2] = 0.0f;
	m[1][3] = 0.0f;

	// Z is left unchanged
	m[2][0] = 0.0f;
	m[2][1] = 0.0f;
	m[2][2] = 1.0f;
	m[2][3] = 0.0f;

	// no translation, no projection
	m[3][0] = 0.0f;
	m[3][1] = 0.0f;
	m[3][2] = 0.0f;
	m[3][3] = 1.0f;
}

// returns a rotation matrix in m
// rotation about the axis (x, y, z) by angle
// the angle must be in radians
// the axis x, y, z must be normalized
//
// An axis lying exactly on one coordinate axis takes a shortcut that writes
// the single-axis rotation directly; the direction of the axis (+ or -) is
// honoured, so rotating about (-1, 0, 0) is the inverse of rotating about
// (1, 0, 0), exactly as the general formula yields.
// Any other axis uses R = ata + cos_angle*(I - ata) + sin_angle*s, where ata
// is the outer product of the axis with itself and s is its cross-product
// (skew-symmetric) matrix.
// The fourth row and column are always set to (0, 0, 0, 1).
void RotateMat(Matrix m, float angle, float x, float y, float z)
{
	float ata[3][3], s[3][3];
	float sin_angle, cos_angle;
	int has_x, has_y, has_z;

	fsincos(angle, &sin_angle, &cos_angle);

	// test the components as floats; reinterpreting their bits as int breaks
	// the aliasing rules and misclassifies -0.0f as a non-zero component
	has_x = (x != 0.0f);
	has_y = (y != 0.0f);
	has_z = (z != 0.0f);

	if (has_x && !has_y && !has_z)
	{
		// rotation by x-axis; about the negative axis the sense of rotation
		// is reversed, which is the same as negating the sine
		if (x < 0.0f)
		{
			sin_angle = -sin_angle;
		}
		m[0][0] = 1.0f; m[0][1] =      0.0f; m[0][2] =       0.0f;
		m[1][0] = 0.0f; m[1][1] = cos_angle; m[1][2] = -sin_angle;
		m[2][0] = 0.0f; m[2][1] = sin_angle; m[2][2] =  cos_angle;
	}
	else if (!has_x && has_y && !has_z)
	{
		// rotation by y-axis (sine negated for the negative axis)
		if (y < 0.0f)
		{
			sin_angle = -sin_angle;
		}
		m[0][0] =  cos_angle; m[0][1] = 0.0f; m[0][2] = sin_angle;
		m[1][0] =       0.0f; m[1][1] = 1.0f; m[1][2] =      0.0f;
		m[2][0] = -sin_angle; m[2][1] = 0.0f; m[2][2] = cos_angle;
	}
	else if (!has_x && !has_y && has_z)
	{
		// rotation by z-axis (sine negated for the negative axis)
		if (z < 0.0f)
		{
			sin_angle = -sin_angle;
		}
		m[0][0] = cos_angle; m[0][1] = -sin_angle; m[0][2] = 0.0f;
		m[1][0] = sin_angle; m[1][1] =  cos_angle; m[1][2] = 0.0f;
		m[2][0] =      0.0f; m[2][1] =       0.0f; m[2][2] = 1.0f;
	}
	else
	{
		// ata = at * a (a is rotation axis; at is a transposed)
		ata[0][0] = x*x; ata[0][1] = y*x; ata[0][2] = z*x;
		ata[1][0] = x*y; ata[1][1] = y*y; ata[1][2] = z*y;
		ata[2][0] = x*z; ata[2][1] = y*z; ata[2][2] = z*z;

		// s = cross-product matrix of the axis
		s[0][0] = 0.0f; s[0][1] =   -z; s[0][2] =    y;
		s[1][0] =    z; s[1][1] = 0.0f; s[1][2] =   -x;
		s[2][0] =   -y; s[2][1] =    x; s[2][2] = 0.0f;

		// R = ata + cos_angle*(I-ata) + sin_angle*s
		m[0][0] = ata[0][0] + cos_angle*(1.0f-ata[0][0]) + sin_angle*s[0][0];
		m[0][1] = ata[0][1] + cos_angle*(    -ata[0][1]) + sin_angle*s[0][1];
		m[0][2] = ata[0][2] + cos_angle*(    -ata[0][2]) + sin_angle*s[0][2];

		m[1][0] = ata[1][0] + cos_angle*(    -ata[1][0]) + sin_angle*s[1][0];
		m[1][1] = ata[1][1] + cos_angle*(1.0f-ata[1][1]) + sin_angle*s[1][1];
		m[1][2] = ata[1][2] + cos_angle*(    -ata[1][2]) + sin_angle*s[1][2];

		m[2][0] = ata[2][0] + cos_angle*(    -ata[2][0]) + sin_angle*s[2][0];
		m[2][1] = ata[2][1] + cos_angle*(    -ata[2][1]) + sin_angle*s[2][1];
		m[2][2] = ata[2][2] + cos_angle*(1.0f-ata[2][2]) + sin_angle*s[2][2];
	}

	// pure rotation: no translation, no projection
	m[0][3] = 0.0f;
	m[1][3] = 0.0f;
	m[2][3] = 0.0f;
	m[3][0] = 0.0f; m[3][1] = 0.0f; m[3][2] = 0.0f; m[3][3] = 1.0f;
}

// Builds the viewport matrix in m from three factors combined with
// MatMultMat3x4: a translation by the box minimum, a scale by half the box
// extent on each axis, and a translation by (1, 1, 1).
// Presumably MatMultMat3x4(res, a, b) yields res = a*b, so that coordinates
// in [-1, 1] land in [min, max] -- confirm against MatMultMat3x4.
// Finally m[1][1] is negated to flip y for upside-down screen coordinates.
void ViewportMat(Matrix m, float min_x, float max_x, float min_y, float max_y, float min_z, float max_z)
{
	Matrix shift_by_one, half_extent_scale, shift_to_min;
	const float half_x = 0.5f*(max_x - min_x);
	const float half_y = 0.5f*(max_y - min_y);
	const float half_z = 0.5f*(max_z - min_z);

	TranslateMat(shift_by_one, 1.0f, 1.0f, 1.0f);
	ScaleMat(half_extent_scale, half_x, half_y, half_z);
	TranslateMat(shift_to_min, min_x, min_y, min_z);

	// the second call passes m as both destination and left operand
	MatMultMat3x4(m, shift_to_min, half_extent_scale);
	MatMultMat3x4(m, m, shift_by_one);

	// flip y to compensate for upside down screen coords
	m[1][1] = -m[1][1];
}

// Fills m with an orthographic projection for the box bounded by the given
// left/right, bottom/top and near/far planes.
// The diagonal holds the per-axis scale (z is negated), the fourth column
// holds the per-axis offset, and the bottom row is (0, 0, 0, 1).
// No check is made for coincident planes (which would divide by zero).
void OrthoMat(Matrix m, float left_plane, float right_plane, float bottom_plane, float top_plane, float near_plane, float far_plane)
{
	// x: scale and offset
	m[0][0] = 2.0f/(right_plane - left_plane);
	m[0][3] = -(right_plane + left_plane)/(right_plane - left_plane);

	// y: scale and offset
	m[1][1] = 2.0f/(top_plane - bottom_plane);
	m[1][3] = -(top_plane + bottom_plane)/(top_plane - bottom_plane);

	// z: scale (negated) and offset
	m[2][2] = -2.0f/(far_plane - near_plane);
	m[2][3] = -(far_plane + near_plane)/(far_plane - near_plane);

	// everything else is constant
	m[0][1] = m[0][2] = 0.0f;
	m[1][0] = m[1][2] = 0.0f;
	m[2][0] = m[2][1] = 0.0f;
	m[3][0] = m[3][1] = m[3][2] = 0.0f;
	m[3][3] = 1.0f;
}

// Fills m with a perspective projection for the frustum bounded by the given
// left/right, bottom/top and near/far planes.
// The bottom row is (0, 0, -1, 0), so the output w is the negated input z.
// No check is made for coincident planes (which would divide by zero).
void FrustumMat(Matrix m, float left_plane, float right_plane, float bottom_plane, float top_plane, float near_plane, float far_plane)
{
	// x: scale and z-dependent shear
	m[0][0] = 2.0f*near_plane/(right_plane - left_plane);
	m[0][2] = (right_plane + left_plane)/(right_plane - left_plane);

	// y: scale and z-dependent shear
	m[1][1] = 2.0f*near_plane/(top_plane - bottom_plane);
	m[1][2] = (top_plane + bottom_plane)/(top_plane - bottom_plane);

	// z: depth scale and offset
	m[2][2] = -(far_plane + near_plane)/(far_plane - near_plane);
	m[2][3] = -2.0f*far_plane*near_plane/(far_plane - near_plane);

	// w takes the negated z
	m[3][2] = -1.0f;

	// everything else is zero
	m[0][1] = m[0][3] = 0.0f;
	m[1][0] = m[1][3] = 0.0f;
	m[2][0] = m[2][1] = 0.0f;
	m[3][0] = m[3][1] = m[3][3] = 0.0f;
}

// Fills m with a symmetric perspective projection, delegating to FrustumMat.
//   fovy         - full vertical field of view; half of it is passed to
//                  tan(), so it is in radians
//   aspect_ratio - width divided by height
//   near_plane,
//   far_plane    - forwarded unchanged to FrustumMat
void PerspectiveMat(Matrix m, float fovy, float aspect_ratio, float near_plane, float far_plane)
{
	// half the size of the view window on the near plane
	const float half_h = near_plane*(float)tan(0.5f*fovy);
	const float half_w = half_h*aspect_ratio;

	FrustumMat(m, -half_w, half_w, -half_h, half_h, near_plane, far_plane);
}

// Fills m with a symmetric orthographic projection, delegating to OrthoMat.
// The view box is sized exactly as PerspectiveMat sizes its near-plane window:
//   fovy         - full vertical angle; half of it is passed to tan(), so it
//                  is in radians
//   aspect_ratio - width divided by height
//   near_plane,
//   far_plane    - forwarded unchanged to OrthoMat
void ParallelMat(Matrix m, float fovy, float aspect_ratio, float near_plane, float far_plane)
{
	// half the size of the view box
	const float half_h = near_plane*(float)tan(0.5f*fovy);
	const float half_w = half_h*aspect_ratio;

	OrthoMat(m, -half_w, half_w, -half_h, half_h, near_plane, far_plane);
}

// MatrixCopy: copies all 16 elements of src into dst, in row-major order.
// Two interchangeable implementations are selected by USE_ASM.
#ifndef USE_ASM
// portable version: element-by-element float copy
void MatrixCopy(Matrix dst, Matrix src)
{
	int row, col;

	for (row = 0; row < 4; row++)
	{
		for (col = 0; col < 4; col++)
		{
			dst[row][col] = src[row][col];
		}
	}
}
#else
// x86 version: moves the 16 elements as 32-bit words through eax; each load
// of the next word is paired with the store of the previous one
__declspec(naked) void MatrixCopy(Matrix dst, Matrix src)
{
	__asm // 21 cycles
	{
		mov			edx, [esp + 4]
		mov			ecx, [esp + 8]

		// ******** 1 cycle delay ********
		// (waiting for ecx)
		mov			eax, [ecx]

		mov			[edx], eax
		mov			eax, [ecx + 4]

		mov			[edx + 4], eax
		mov			eax, [ecx + 8]

		mov			[edx + 8], eax
		mov			eax, [ecx + 12]

		mov			[edx + 12], eax
		mov			eax, [ecx + 16]

		mov			[edx + 16], eax
		mov			eax, [ecx + 20]

		mov			[edx + 20], eax
		mov			eax, [ecx + 24]

		mov			[edx + 24], eax
		mov			eax, [ecx + 28]

		mov			[edx + 28], eax
		mov			eax, [ecx + 32]

		mov			[edx + 32], eax
		mov			eax, [ecx + 36]

		mov			[edx + 36], eax
		mov			eax, [ecx + 40]

		mov			[edx + 40], eax
		mov			eax, [ecx + 44]

		mov			[edx + 44], eax
		mov			eax, [ecx + 48]

		mov			[edx + 48], eax
		mov			eax, [ecx + 52]

		mov			[edx + 52], eax
		mov			eax, [ecx + 56]

		mov			[edx + 56], eax
		mov			eax, [ecx + 60]

		mov			[edx + 60], eax

		ret
	}
}
#endif

// Determinant helpers. Arguments are the matrix elements in row-major order
// (aRC = row R, column C). Every argument is expanded more than once, so
// pass only side-effect-free expressions.

// | a b |
// | c d |
#define DET_2x2(a, b, c, d) \
	( ((a) * (d)) - ((b) * (c)) )

// 3x3 determinant by cofactor expansion along the first column
#define DET_3x3(a00, a01, a02,  a10, a11, a12,  a20, a21, a22) \
	( (a00) * DET_2x2(a11, a12, a21, a22) \
	- (a10) * DET_2x2(a01, a02, a21, a22) \
	+ (a20) * DET_2x2(a01, a02, a11, a12) )

// 4x4 determinant by cofactor expansion along the first column
#define DET_4x4(a00, a01, a02, a03,  a10, a11, a12, a13,  a20, a21, a22, a23,  a30, a31, a32, a33) \
	( (a00) * DET_3x3(a11, a12, a13, a21, a22, a23, a31, a32, a33) \
	- (a10) * DET_3x3(a01, a02, a03, a21, a22, a23, a31, a32, a33) \
	+ (a20) * DET_3x3(a01, a02, a03, a11, a12, a13, a31, a32, a33) \
	- (a30) * DET_3x3(a01, a02, a03, a11, a12, a13, a21, a22, a23) )

// Returns the determinant of the upper-left 3x3 block of m.
// The fourth row and column of m are not read.
// All six product terms are included (via DET_3x3); the previous two-term
// form returned 0 for perfectly invertible matrices, e.g. a 90 degree
// rotation about X, which made InvertMatrix3x4Cramer reject them.
float Determinant3x4(Matrix m)
{
	return (DET_3x3 (m[0][0], m[0][1], m[0][2],
			 m[1][0], m[1][1], m[1][2],
			 m[2][0], m[2][1], m[2][2]));
}

// Returns the determinant of the full 4x4 matrix m, evaluated with the
// DET_4x4 macro.
float Determinant4x4(Matrix m)
{
	const float *row0 = m[0];
	const float *row1 = m[1];
	const float *row2 = m[2];
	const float *row3 = m[3];

	return DET_4x4(row0[0], row0[1], row0[2], row0[3],
				   row1[0], row1[1], row1[2], row1[3],
				   row2[0], row2[1], row2[2], row2[3],
				   row3[0], row3[1], row3[2], row3[3]);
}

// Writes into adj_mat the adjoint (adjugate) of the affine matrix mat, i.e.
// of mat with its bottom row taken to be (0, 0, 0, 1); mat[3][*] is not read.
//   - upper-left 3x3: transposed cofactors of mat's 3x3 block
//   - fourth column:  -(adjoint 3x3) * (mat's fourth column)
//   - bottom row:     (0, 0, 0, det of mat's 3x3 block)
// Dividing the result by that determinant gives the affine inverse.
// All 16 elements of adj_mat are written; adj_mat must not alias mat.
void AdjointMatrix3x4(Matrix adj_mat, Matrix mat)
{
	// each component of the adjoint matrix is computed by computing the sub 2x2 matrix determinants
	// note also that the adjoint is transposed
	adj_mat[0][0] =  ((mat[1][1] * mat[2][2]) -
										(mat[1][2] * mat[2][1]));
	adj_mat[1][0] = -((mat[1][0] * mat[2][2]) -
										(mat[1][2] * mat[2][0]));
	adj_mat[2][0] =  ((mat[1][0] * mat[2][1]) -
										(mat[1][1] * mat[2][0]));
	adj_mat[3][0] = 0.0f;

	adj_mat[0][1] = -((mat[0][1] * mat[2][2]) -
										(mat[0][2] * mat[2][1]));
	adj_mat[1][1] =  ((mat[0][0] * mat[2][2]) -
										(mat[0][2] * mat[2][0]));
	adj_mat[2][1] = -((mat[0][0] * mat[2][1]) -
										(mat[0][1] * mat[2][0]));
	adj_mat[3][1] = 0.0f;

	adj_mat[0][2] =  ((mat[0][1] * mat[1][2]) -
										(mat[0][2] * mat[1][1]));
	adj_mat[1][2] = -((mat[0][0] * mat[1][2]) -
										(mat[0][2] * mat[1][0]));
	adj_mat[2][2] =  ((mat[0][0] * mat[1][1]) -
										(mat[0][1] * mat[1][0]));
	adj_mat[3][2] = 0.0f;

	// translation part: without it the fourth column would be left holding
	// whatever the caller's buffer contained
	adj_mat[0][3] = -((adj_mat[0][0] * mat[0][3]) +
										(adj_mat[0][1] * mat[1][3]) +
										(adj_mat[0][2] * mat[2][3]));
	adj_mat[1][3] = -((adj_mat[1][0] * mat[0][3]) +
										(adj_mat[1][1] * mat[1][3]) +
										(adj_mat[1][2] * mat[2][3]));
	adj_mat[2][3] = -((adj_mat[2][0] * mat[0][3]) +
										(adj_mat[2][1] * mat[1][3]) +
										(adj_mat[2][2] * mat[2][3]));

	// determinant of the 3x3 block, expanded along mat's first row using the
	// cofactors already stored in adj_mat's first column
	adj_mat[3][3] =  ((mat[0][0] * adj_mat[0][0]) +
										(mat[0][1] * adj_mat[1][0]) +
										(mat[0][2] * adj_mat[2][0]));
}
// Unfinished stub: the whole body is commented out, so calling this function
// does nothing and adj_mat is left untouched. The comment below is the start
// of an adjoint computation that was never completed (only the [0][0] entry
// is sketched). AdjointMatrix4x4 is the working implementation.
void AdjointMatrix4x41(Matrix adj_mat, Matrix mat)
{
	/*
	//  00  01  02  03
	//  10  11  12  13
	//  20  21  22  23
	//  30  31  32  33

	// each component of the adjoint matrix is computed by computing the sub 3x3 matrix determinants
	// note also that the adjoint is transposed

	//  XX  XX  XX  XX
	//  XX  11  12  13
	//  XX  21  22  23
	//  XX  31  32  33
	adj_mat[0][0] =  ((m11 * m22 * m33) +
										(m12 * m23 * m31) +
										(m13 * m21 * m32)) -
									 ((m13 * m22 * m31) +
										(m12 * m21 * m33) +
										(m11 * m23 * m32));
	//  XX  XX  XX  XX
	//  10  XX  12  13
	//  20  XX  22  23
	//  30  XX  32  33
	*/
}
// Determinant of the 3x3 sub-matrix of mat selected by the three row indices
// in r and the three column indices in c: the three "forward" diagonal
// products minus the three "backward" ones.
static float SubDeterminant3x3(Matrix mat, const int *r, const int *c)
{
	return ((mat[r[0]][c[0]] * mat[r[1]][c[1]] * mat[r[2]][c[2]] +
			 mat[r[0]][c[1]] * mat[r[1]][c[2]] * mat[r[2]][c[0]] +
			 mat[r[0]][c[2]] * mat[r[1]][c[0]] * mat[r[2]][c[1]]) -
			(mat[r[0]][c[2]] * mat[r[1]][c[1]] * mat[r[2]][c[0]] +
			 mat[r[0]][c[1]] * mat[r[1]][c[0]] * mat[r[2]][c[2]] +
			 mat[r[0]][c[0]] * mat[r[1]][c[2]] * mat[r[2]][c[1]]));
}

// Writes into adj_mat the adjoint (adjugate) of the 4x4 matrix mat: the
// transposed matrix of cofactors. Entry [i][j] is the determinant of mat
// with row j and column i removed, negated when i + j is odd.
// adj_mat must not alias mat (entries are stored while mat is still read).
void AdjointMatrix4x4(Matrix adj_mat, Matrix mat)
{
	// kept[k] = the three indices that remain once index k is struck out
	static const int kept[4][3] =
	{
		{ 1, 2, 3 },
		{ 0, 2, 3 },
		{ 0, 1, 3 },
		{ 0, 1, 2 }
	};
	int i, j;

	// filled one column of adj_mat at a time
	for (j = 0; j < 4; j++)
	{
		for (i = 0; i < 4; i++)
		{
			const float minor = SubDeterminant3x3(mat, kept[j], kept[i]);

			adj_mat[i][j] = ((i + j) & 1) ? -minor : minor;
		}
	}
}

// Inversion by Cramer's rule: the inverse is the adjoint matrix divided by
// the determinant.
//
// Inverts the 3x4 (affine) part of mat into inv_mat.
// Returns 0, leaving inv_mat untouched, when Determinant3x4(mat) is exactly
// zero (matrix not invertible); returns 1 on success.
// On success the first three rows of inv_mat are the rows produced by
// AdjointMatrix3x4 scaled by 1/det, and the bottom row is (0, 0, 0, 1).
// NOTE(review): the fourth-column entries are scaled exactly as
// AdjointMatrix3x4 left them -- confirm that routine populates them.
int InvertMatrix3x4Cramer(Matrix inv_mat, Matrix mat)
{
	const float det = Determinant3x4(mat);
	float scale;
	int row, col;

	// a zero determinant means there is no inverse
	if (det == 0)
	{
		return 0;
	}

	scale = 1.0f/det;

	AdjointMatrix3x4(inv_mat, mat);

	for (row = 0; row < 3; row++)
	{
		for (col = 0; col < 4; col++)
		{
			inv_mat[row][col] *= scale;
		}
	}

	// an affine inverse keeps the (0, 0, 0, 1) bottom row
	inv_mat[3][0] = 0.0f;
	inv_mat[3][1] = 0.0f;
	inv_mat[3][2] = 0.0f;
	inv_mat[3][3] = 1.0f;

	return 1;
}

// Inverts the full 4x4 matrix mat into inv_mat by Cramer's rule: the adjoint
// from AdjointMatrix4x4 with every element scaled by 1/Determinant4x4(mat).
// Returns 0, leaving inv_mat untouched, when the determinant is exactly zero
// (matrix not invertible); returns 1 on success.
int InvertMatrix4x4Cramer(Matrix inv_mat, Matrix mat)
{
	const float det = Determinant4x4(mat);
	float scale;
	int row, col;

	// a zero determinant means there is no inverse
	if (det == 0)
	{
		return 0;
	}

	scale = 1.0f/det;

	AdjointMatrix4x4(inv_mat, mat);

	for (row = 0; row < 4; row++)
	{
		for (col = 0; col < 4; col++)
		{
			inv_mat[row][col] *= scale;
		}
	}

	return 1;
}

// Full 4x4 inversion by Cramer's rule, carried out in double precision.
//
// inv_mat receives the result (rounded back to float); mat is the matrix
// to invert. All reads of mat happen before the first write to inv_mat.
// Returns 0 without writing inv_mat when the determinant is exactly zero,
// otherwise returns 1.
int InvertMatrix4x4CramerPrecise(Matrix inv_mat, Matrix mat)
{
	double m[4][4];
	double adj_mat[4][4];
	double det, minor;
	int rows[3], cols[3];
	int i, j, k, count;

	// promote the input once so every product below is done in double
	for (i=0; i<4; i++)
	{
		for (j=0; j<4; j++)
		{
			m[i][j] = (double)mat[i][j];
		}
	}

	// each component of the adjoint matrix is a signed 3x3 sub-determinant
	// note that the adjoint is transposed: entry [i][j] comes from deleting
	// row j and column i of the input
	for (i=0; i<4; i++)
	{
		for (j=0; j<4; j++)
		{
			// the three rows left after removing row j
			count = 0;
			for (k=0; k<4; k++)
			{
				if (k != j)
				{
					rows[count++] = k;
				}
			}
			// the three columns left after removing column i
			count = 0;
			for (k=0; k<4; k++)
			{
				if (k != i)
				{
					cols[count++] = k;
				}
			}

			// 3x3 determinant, expanded along its first row
			minor = m[rows[0]][cols[0]] * (m[rows[1]][cols[1]] * m[rows[2]][cols[2]] -
			                               m[rows[1]][cols[2]] * m[rows[2]][cols[1]]) -
			        m[rows[0]][cols[1]] * (m[rows[1]][cols[0]] * m[rows[2]][cols[2]] -
			                               m[rows[1]][cols[2]] * m[rows[2]][cols[0]]) +
			        m[rows[0]][cols[2]] * (m[rows[1]][cols[0]] * m[rows[2]][cols[1]] -
			                               m[rows[1]][cols[1]] * m[rows[2]][cols[0]]);

			// checkerboard sign of the cofactor
			adj_mat[i][j] = ((i + j) & 1) ? -minor : minor;
		}
	}

	// determinant by cofactor expansion along row 0; adj_mat[k][0] is the
	// cofactor of m[0][k]. A 4x4 determinant has 24 product terms, so the
	// diagonal-products shortcut that works for 3x3 cannot be used here.
	det = m[0][0] * adj_mat[0][0] +
	      m[0][1] * adj_mat[1][0] +
	      m[0][2] * adj_mat[2][0] +
	      m[0][3] * adj_mat[3][0];
	// if determinant is zero, this is an uninvertable matrix
	if (det == 0)
	{
		return 0;
	}

	for (i=0; i<4; i++)
	{
		for (j=0; j<4; j++)
		{
			inv_mat[i][j] = (float)(adj_mat[i][j]/det);
		}
	}

	return 1;
}

// Inversion by Gaussian (Gauss-Jordan) elimination.
// Row operations reduce a working copy of the matrix to the identity
// matrix; the same operations are applied to a matrix that starts out as
// the identity, which therefore ends up holding the inverse.
//
// inv_mat receives the result; mat is the matrix to invert.
// Returns 0 without writing inv_mat when Determinant3x4(mat) is exactly
// zero, otherwise returns 1. Only rows 0..2 are pivoted and normalised;
// row 3 of the working copy is used as-is (the layout this routine is
// written for has (0, 0, 0, 1) there).
int InvertMatrix3x4Gaussian(Matrix inv_mat, Matrix mat)
{
	Matrix work, accum;
	float *src[4], *dst[4], *held;
	float determinant, factor;
	int pivot, other, r, c;

	determinant = Determinant3x4(mat);
	if (determinant == 0)
	{
		// a zero determinant means there is no inverse
		return 0;
	}

	MatrixCopy(work, mat);
	IdentityMat(accum);

	// rows are addressed through pointer tables so a row swap is just a
	// pointer exchange
	for (r=0; r<4; r++)
	{
		src[r] = &work[r][0];
		dst[r] = &accum[r][0];
	}

	// forward pass: for each of the first three columns put a 1 on the
	// diagonal and zeros beneath it (within rows 0..2)
	for (pivot=0; pivot<3; pivot++)
	{
		if (src[pivot][pivot] == 0)
		{
			other = pivot;
			if (pivot == 0)
			{
				// take the first of rows 1..2 with a nonzero entry in column 0
				for (r=1; r<3; r++)
				{
					if (src[r][0] != 0)
					{
						other = r;
						break;
					}
				}
			}
			else if (pivot == 1)
			{
				// row 2 is the only candidate left; it is swapped in
				// without testing its entry
				other = 2;
			}

			if (other != pivot)
			{
				held = src[other];
				src[other] = src[pivot];
				src[pivot] = held;
				held = dst[other];
				dst[other] = dst[pivot];
				dst[pivot] = held;
			}
		}

		// scale the pivot row so the diagonal entry becomes 1
		// (entries left of the diagonal are already zero and are skipped)
		factor = 1.0f/src[pivot][pivot];
		src[pivot][pivot] = 1.0f;
		for (c=pivot+1; c<4; c++)
		{
			src[pivot][c] *= factor;
		}
		for (c=0; c<4; c++)
		{
			dst[pivot][c] *= factor;
		}

		// clear the pivot column in the rows below (up to row 2)
		for (r=pivot+1; r<3; r++)
		{
			if (src[r][pivot] == 0)
			{
				continue;
			}
			factor = src[r][pivot];

			src[r][pivot] = 0.0f;
			for (c=pivot+1; c<4; c++)
			{
				src[r][c] -= factor*src[pivot][c];
			}
			for (c=0; c<4; c++)
			{
				dst[r][c] -= factor*dst[pivot][c];
			}
		}
	}
	//  1 X X X
	//  0 1 X X
	//  0 0 1 X
	//  0 0 0 1

	// backward pass: clear everything above the diagonal, working from
	// the last column towards the second
	for (pivot=3; pivot>0; pivot--)
	{
		for (r=0; r<pivot; r++)
		{
			if (src[r][pivot] == 0)
			{
				continue;
			}
			factor = src[r][pivot];

			for (c=0; c<pivot; c++)
			{
				src[r][c] -= factor*src[pivot][c];
			}
			src[r][pivot] = 0.0f;
			for (c=0; c<4; c++)
			{
				dst[r][c] -= factor*dst[pivot][c];
			}
		}
	}
	//  1 0 0 0
	//  0 1 0 0
	//  0 0 1 0
	//  0 0 0 1

	// the working copy is now the identity matrix and the accumulated
	// matrix holds the inverse; copy it out in final row order
	for (r=0; r<4; r++)
	{
		for (c=0; c<4; c++)
		{
			inv_mat[r][c] = dst[r][c];
		}
	}

	return 1;
}

// Full 4x4 inversion by Gaussian (Gauss-Jordan) elimination.
// Row operations reduce a working copy of the matrix to the identity
// matrix; the same operations applied to an identity matrix turn it into
// the inverse.
//
// inv_mat receives the result; mat is the matrix to invert.
// Returns 0 without writing inv_mat when Determinant4x4(mat) is exactly
// zero, otherwise returns 1.
int InvertMatrix4x4Gaussian(Matrix inv_mat, Matrix mat)
{
	Matrix work, accum;
	float *src[4], *dst[4], *held;
	float determinant, factor;
	int pivot, other, r, c;

	determinant = Determinant4x4(mat);
	if (determinant == 0)
	{
		// a zero determinant means there is no inverse
		return 0;
	}

	MatrixCopy(work, mat);
	IdentityMat(accum);

	// rows are addressed through pointer tables so a row swap is just a
	// pointer exchange
	for (r=0; r<4; r++)
	{
		src[r] = &work[r][0];
		dst[r] = &accum[r][0];
	}

	// forward pass: for each column put a 1 on the diagonal and zeros
	// beneath it
	for (pivot=0; pivot<4; pivot++)
	{
		if (src[pivot][pivot] == 0)
		{
			other = pivot;
			if (pivot < 2)
			{
				// take the first lower row with a nonzero entry in this column
				for (r=pivot+1; r<4; r++)
				{
					if (src[r][pivot] != 0)
					{
						other = r;
						break;
					}
				}
			}
			else if (pivot == 2)
			{
				// row 3 is the only candidate left; it is swapped in
				// without testing its entry
				other = 3;
			}

			if (other != pivot)
			{
				held = src[other];
				src[other] = src[pivot];
				src[pivot] = held;
				held = dst[other];
				dst[other] = dst[pivot];
				dst[pivot] = held;
			}
		}

		// scale the pivot row so the diagonal entry becomes 1
		// (entries left of the diagonal are already zero and are skipped)
		factor = 1.0f/src[pivot][pivot];
		src[pivot][pivot] = 1.0f;
		for (c=pivot+1; c<4; c++)
		{
			src[pivot][c] *= factor;
		}
		for (c=0; c<4; c++)
		{
			dst[pivot][c] *= factor;
		}

		// clear the pivot column in every row below
		for (r=pivot+1; r<4; r++)
		{
			if (src[r][pivot] == 0)
			{
				continue;
			}
			factor = src[r][pivot];

			src[r][pivot] = 0.0f;
			for (c=pivot+1; c<4; c++)
			{
				src[r][c] -= factor*src[pivot][c];
			}
			for (c=0; c<4; c++)
			{
				dst[r][c] -= factor*dst[pivot][c];
			}
		}
	}
	//  1 X X X
	//  0 1 X X
	//  0 0 1 X
	//  0 0 0 1

	// backward pass: clear everything above the diagonal, working from
	// the last column towards the second
	for (pivot=3; pivot>0; pivot--)
	{
		for (r=0; r<pivot; r++)
		{
			if (src[r][pivot] == 0)
			{
				continue;
			}
			factor = src[r][pivot];

			for (c=0; c<pivot; c++)
			{
				src[r][c] -= factor*src[pivot][c];
			}
			src[r][pivot] = 0.0f;
			for (c=0; c<4; c++)
			{
				dst[r][c] -= factor*dst[pivot][c];
			}
		}
	}
	//  1 0 0 0
	//  0 1 0 0
	//  0 0 1 0
	//  0 0 0 1

	// the working copy is now the identity matrix and the accumulated
	// matrix holds the inverse; copy it out in final row order
	for (r=0; r<4; r++)
	{
		for (c=0; c<4; c++)
		{
			inv_mat[r][c] = dst[r][c];
		}
	}

	return 1;
}

// Full 4x4 inversion by Gaussian (Gauss-Jordan) elimination, carried out
// in double precision.
// Row operations reduce a working copy of the matrix to the identity
// matrix; the same operations applied to an identity matrix turn it into
// the inverse.
//
// inv_mat receives the result (rounded back to float); mat is the matrix
// to invert. inv_mat is only written once the whole reduction succeeded.
// Returns 0 without writing inv_mat when the matrix is singular, i.e. some
// column has no nonzero entry on or below the diagonal when its pivot is
// needed; otherwise returns 1.
int InvertMatrix4x4GaussianPrecise(Matrix inv_mat, Matrix mat)
{
	double *row[4], *inv_row[4], *tmp_row;
	double cur_mat[4][4], cur_inv_mat[4][4];
	double scale;
	int i, j, p;

	// working copy of the input in double, next to an identity matrix
	for (i=0; i<4; i++)
	{
		for (j=0; j<4; j++)
		{
			cur_mat[i][j] = (double)mat[i][j];
			cur_inv_mat[i][j] = (i == j) ? 1.0 : 0.0;
		}
	}

	// keep pointers to the rows so that we can easily swap rows
	for (i=0; i<4; i++)
	{
		row[i] = &cur_mat[i][0];
		inv_row[i] = &cur_inv_mat[i][0];
	}

	// forward pass: for each column put a 1 on the diagonal and zeros
	// beneath it
	for (p=0; p<4; p++)
	{
		if (row[p][p] == 0)
		{
			// look for the first lower row that can supply a pivot
			for (i=p+1; i<4; i++)
			{
				if (row[i][p] != 0)
				{
					break;
				}
			}
			if (i == 4)
			{
				// no pivot in this column: the matrix is uninvertable.
				// Singularity is detected here rather than through an
				// up-front determinant; a 4x4 determinant needs all 24
				// product terms, the diagonal-products shortcut is only
				// valid for 3x3.
				return 0;
			}

			// swap
			tmp_row = row[i];
			row[i] = row[p];
			row[p] = tmp_row;
			tmp_row = inv_row[i];
			inv_row[i] = inv_row[p];
			inv_row[p] = tmp_row;
		}

		// scale the pivot row so the diagonal entry becomes 1
		// (entries left of the diagonal are already zero and are skipped)
		scale = 1.0/row[p][p];
		row[p][p] = 1.0;
		for (j=p+1; j<4; j++)
		{
			row[p][j] *= scale;
		}
		for (j=0; j<4; j++)
		{
			inv_row[p][j] *= scale;
		}

		// clear the pivot column in every row below
		for (i=p+1; i<4; i++)
		{
			if (row[i][p] != 0)
			{
				scale = row[i][p];

				row[i][p] = 0.0;
				for (j=p+1; j<4; j++)
				{
					row[i][j] -= scale*row[p][j];
				}
				for (j=0; j<4; j++)
				{
					inv_row[i][j] -= scale*inv_row[p][j];
				}
			}
		}
	}
	//  1 X X X
	//  0 1 X X
	//  0 0 1 X
	//  0 0 0 1

	// backward pass: clear everything above the diagonal, working from
	// the last column towards the second
	for (p=3; p>0; p--)
	{
		for (i=0; i<p; i++)
		{
			if (row[i][p] != 0)
			{
				scale = row[i][p];

				for (j=0; j<p; j++)
				{
					row[i][j] -= scale*row[p][j];
				}
				row[i][p] = 0.0;
				for (j=0; j<4; j++)
				{
					inv_row[i][j] -= scale*inv_row[p][j];
				}
			}
		}
	}
	//  1 0 0 0
	//  0 1 0 0
	//  0 0 1 0
	//  0 0 0 1

	// now we're done, we've reduced the original matrix to the identity matrix
	// and created the inverse matrix in our original identity matrix
	for (i=0; i<4; i++)
	{
		for (j=0; j<4; j++)
		{
			inv_mat[i][j] = (float)inv_row[i][j];
		}
	}

	return 1;
}
