// xgui 0.0.4 / 2002-08-07
//	mmx.cpp
//
//	http://606u.dir.bg/
//	606u@dir.bg

#include "_p.h"

#include "mmx.h"


namespace xgui
{


// Converts `samples` 32-bit colours from `input` to their look-up-table
// replacements and stores them in `output`, two colours per loop iteration,
// using MMX instructions (MSVC 32-bit inline assembler).
//
// For each of the three low bytes i of a colour the code computes
//		j = saturate_to_255 (i + 8);
//		j = j - (j >> 4);
// and keeps the high nibble of j.  The three nibbles are packed into a 12-bit
// index (byte 0 -> bits 0..3, byte 1 -> bits 4..7, byte 2 -> bits 8..11), so
// `palette` is read at entries 0..4095.  The top byte of each input colour is
// ignored.
//
// parameters:
//	input	- source colours; `samples` doublewords are read (in pairs)
//	palette	- look-up table of doublewords, indexed as described above
//	samples	- number of colours to convert; colours are handled in pairs, so a
//			  trailing odd colour is NOT converted and its output slot is left
//			  untouched
//	output	- destination; receives one doubleword per converted colour
//
// prerequisites: input and output should be qword-aligned and with length
// divisible by 8 (bytes, i.e. an even number of colours)
inline void convert_rgb_to_palette_mmx (
	IN const COLORREF *input,
	IN const COLORREF *palette,
	IN unsigned int samples,
	OUT COLORREF *output)
{
	// The loop below is bottom-tested (dec ecx / jnz), so it must never be
	// entered with a zero pair count: ecx would wrap to 0xFFFFFFFF and the loop
	// would run far past the end of both buffers.  No MMX instruction has been
	// executed at this point, so returning without emms is safe.
	if (samples < 2)
		return;

	__asm {
// note: it is not required to preserve the registers (eax, ebx, ecx, edx and edi)
		mov		ecx, samples
		shr		ecx, 1			// ecx = number of colour pairs (each iteration does 2 doublewords)

		mov		eax, input		// eax = &source
		mov		edx, output		// edx = &destination
		mov		edi, palette	// edi = &look-up table

		// loop-invariant constants, loaded from the data bytes emitted below
		movq	mm3, add_qword	// mm3 = per-byte rounding addend (8)
		movq	mm4, cut_hi_half_byte_mask	// mm4 = keeps the low nibble of each of the 3 low bytes
		movq	mm5, mask_00000F	// mm5 = index bits 2..5   (byte 0 nibble, x4)
		movq	mm6, mask_0000F0	// mm6 = index bits 6..9   (byte 1 nibble, x4)
		movq	mm7, mask_000F00	// mm7 = index bits 10..13 (byte 2 nibble, x4)

		jmp		begin			// skip over the constant data that follows

//	equivalent to: static const unsigned long add_qword [2] = { 0x00080808, 0x00080808 };
add_qword:
		_emit	0x08
		_emit	0x08
		_emit	0x08
		_emit	0x00
		_emit	0x08
		_emit	0x08
		_emit	0x08
		_emit	0x00

//	equivalent to: static const unsigned long cut_hi_half_byte_mask [2] = { 0x000F0F0F, 0x000F0F0F };
cut_hi_half_byte_mask:
		_emit	0x0F
		_emit	0x0F
		_emit	0x0F
		_emit	0x00
		_emit	0x0F
		_emit	0x0F
		_emit	0x0F
		_emit	0x00

//	equivalent to: static const unsigned long mask_00000F [2] = { 0x0000003C, 0x0000003C };
mask_00000F:
		_emit	0x3C
		_emit	0x00
		_emit	0x00
		_emit	0x00
		_emit	0x3C
		_emit	0x00
		_emit	0x00
		_emit	0x00

//	equivalent to: static const unsigned long mask_0000F0 [2] = { 0x000003C0, 0x000003C0 };
mask_0000F0:
		_emit	0xC0
		_emit	0x03
		_emit	0x00
		_emit	0x00
		_emit	0xC0
		_emit	0x03
		_emit	0x00
		_emit	0x00

//	equivalent to: static const unsigned long mask_000F00 [2] = { 0x00003C00, 0x00003C00 };
mask_000F00:
		_emit	0x00
		_emit	0x3C
		_emit	0x00
		_emit	0x00
		_emit	0x00
		_emit	0x3C
		_emit	0x00
		_emit	0x00

begin:
		// begin loop (invariant: ecx = pairs still to do, always > 0 here)
		movq	mm0, [eax]	// j = i / mm0 = two source colours
		// here: mm0 = 0000 0000 RRRR RRRR GGGG GGGG BBBB BBBB x2 (binary)
		// (R, G and B in these diagrams label bytes 2, 1 and 0 of each doubleword)

		paddusb	mm0, mm3	// j = j + 8 (saturating, per byte)
		movq	mm1, mm0	// j* = j
		psrlq	mm1, 4		// j* >>= 4 (bits that cross byte borders are masked off next)
		pand	mm1, mm4	// j* &= 0x0F
		psubusb	mm0, mm1	// j = j - j*
		// here: mm0 = j - (j >> 4)

		psrlq	mm0, 2		// j >>= 2 (high nibble of each byte now sits at bits 2..5 of that byte)
		// here: mm0 = 0000 0000 00RR RR00 00GG GG00 00BB BB00 x2 (binary)

		// note: following code is intentionally interleaved
		movq	mm1, mm0	// j(2) = j
		movq	mm2, mm0	// j(3) = j
		psrlq	mm1, 4		// j(2) >>= 4
		psrlq	mm2, 8		// j(3) >>= 8
		pand	mm1, mm6	// mask j(2)
		pand	mm2, mm7	// mask j(3)
		pand	mm0, mm5	// mask j(1)
		// here: mm0 = 0000 0000 0000 0000 0000 0000 00BB BB00 x2 (binary)
		// here: mm1 = 0000 0000 0000 0000 0000 00GG GG00 0000 x2 (binary)
		// here: mm2 = 0000 0000 0000 0000 00RR RR00 0000 0000 x2 (binary)

		por		mm0, mm1	// combine j(1), j(2) and j(3)
		por		mm0, mm2
		// here: mm0 = 0000 0000 0000 0000 00RR RRGG GGBB BB00 x2 (binary)
		// index is multiplied by 4, because LUT contains doublewords (4 bytes each)

		movd	ebx, mm0			// ebx = lo-dword (mm0) = lut byte offset for lo-dword color
		psrlq	mm0, 32				// mm0 >>= 32
		movd	mm1, [edi + ebx]	// mm1 = color for lo-dword
		movd	ebx, mm0			// ebx = hi-dword = lut byte offset for hi-dword color
		movd	mm0, [edi + ebx]	// mm0 = color for hi-dword
		psllq	mm0, 32				// mm0 <<= 32
		por		mm0, mm1			// mm0 = both colors

		movq	[edx], mm0	// destination = mm0 (result)

		add		edx, 8		// advance destination (++&j)
		add		eax, 8		// ++(&i) / advance source

		dec		ecx
		jnz		begin
		// end loop

		emms				// end MMX (makes the FPU usable again)
	}
}


};	// xgui namespace