/* -----------------------------------------------------------------------
 * 
 * umac.c -- C Implementation of UMAC Message Authentication
 *
 * Version 0.04 of draft-krovetz-umac-00.txt -- 2000 August
 *
 * For a full description of UMAC message authentication see the UMAC
 * world-wide-web page at http://www.cs.ucdavis.edu/~rogaway/umac
 * Please report bugs and suggestions to the UMAC webpage.
 *
 * Copyright (c) 1999-2000 Ted Krovetz (tdk@acm.org)
 *                                                                 
 * Permission to use, copy, modify, and distribute this software and  
 * its documentation for any purpose and without fee, is hereby granted,
 * provided that the above copyright notice appears in all copies and  
 * that both that copyright notice and this permission notice appear   
 * in supporting documentation, and that the names of the University of
 * California and Ted Krovetz not be used in advertising or publicity  
 * pertaining to distribution of the software without specific,        
 * written prior permission.                                          
 *                                                                   
 * The Regents of the University of California and Ted Krovetz disclaim 
 * all warranties with regard to this software, including all implied
 * warranties of merchantability and fitness.  In no event shall the  
 * University of California or Ted Krovetz be liable for any special,  
 * indirect or consequential damages or any damages whatsoever resulting
 * from loss of use, data or profits, whether in an action of contract,
 * negligence or other tortious action, arising out of or in connection
 * with the use or performance of this software.
 * 
 * ---------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
 * Rules for writing an architecture specific include file
 *
 * - For any "class" of functions written here (eg. ARCH_ROTL), all
 *   functions in the class must be written here.
 * - For each "class" written, define the class macro as 1
 *   (eg. #define ARCH_ROTL 1).
 * - This file is included because we are using extensions to ANSI C,
 *   but you must distinguish between "intrinsic" and "intrinsic+asm"
 *   extensions. This is easily done by writing this file in three
 *   sections: (1) intrinsic only functions, (2) intrinsic functions for
 *   which there exists an assembly equivalent and (3) assembly functions.
 *   If we are to do "intrinsic" extensions, then (1) and (2) should be
 *   compiled, otherwise if we are "intrinsic+asm", then (1) and (3).
 *   The assumption is that for speed, C < C+intrinsics < C+assembly.
 * ---------------------------------------------------------------------- */

/* Build-time CPU/compiler feature switches for this x86 file.
 * Both default off; enable when the target supports the intrinsics. */
#define SSE2               0  /* Streaming SIMD 2 available on P4         */
#define INTEL_INTRINSICS   0  /* Intel's MMX and SSE intrinsics           */

#pragma warning(disable: 4731)  /* Turn off "ebp manipulation" warning    */

/* ---------------------------------------------------------------------- */
/* ---------------------------------------------------------------------- */
/* ---------------------------------------------------------------------- */
/* First define routines which are only written using compiler intrinsics */
/* ---------------------------------------------------------------------- */
/* ---------------------------------------------------------------------- */
/* ---------------------------------------------------------------------- */


/* ---------------------------------------------------------------------- */
#define ARCH_ROTL  1
#if     ARCH_ROTL
/* ---------------------------------------------------------------------- */
/* 32-bit rotate-left via the MSVC compiler intrinsic _rotl.  Separate
 * names are kept for variable vs. compile-time-constant shift counts
 * so other architectures can specialize them differently. */
#define ROTL32_VAR(r,n)   _rotl(r,n)
#define ROTL32_CONST(r,n) _rotl(r,n)
/* ---------------------------------------------------------------------- */
#endif /* ARCH_ROTL */
/* ---------------------------------------------------------------------- */


/* ----------------------------------------------------------------------
 * ----------------------------------------------------------------------
 * ----------------------------------------------------------------------
 * Second define routines which are written using compiler intrinsics but
 * which have assembly equivalents in the third section.
 * ----------------------------------------------------------------------
 * ---------------------------------------------------------------------- */
#if ( ! USE_C_AND_ASSEMBLY)  /* Intrinsics only allowed */
/* ---------------------------------------------------------------------- */


/* ---------------------------------------------------------------------- */
#define ARCH_ENDIAN_LS  1
#if     ARCH_ENDIAN_LS
/* ---------------------------------------------------------------------- */

/* Load a 32-bit word from ptr and return it with its four bytes in
 * reverse order (endian swap).  ptr is assumed suitably aligned for a
 * UINT32 access on this (x86) target. */
static UINT32 LOAD_UINT32_REVERSED(void *ptr)
{
    UINT32 x = *(UINT32 *)ptr;
    return (x >> 24) | ((x >> 8) & 0x0000FF00) |
           ((x << 8) & 0x00FF0000) | (x << 24);
}
               
/* Store the 32-bit value x at ptr with its four bytes in reverse
 * order (endian swap).  ptr is assumed suitably aligned for a UINT32
 * access on this (x86) target. */
static void STORE_UINT32_REVERSED(void *ptr, UINT32 x)
{
    *(UINT32 *)ptr = (x >> 24) | ((x >> 8) & 0x0000FF00) |
                     ((x << 8) & 0x00FF0000) | (x << 24);
}

/* Load a 16-bit word from ptr and return it with its two bytes
 * swapped (endian swap). */
static UINT16 LOAD_UINT16_REVERSED(void *ptr)
{
    UINT16 x = *(UINT16 *)ptr;
    return (UINT16)((x << 8) | (x >> 8));
}
               
/* Store the 16-bit value x at ptr with its two bytes swapped
 * (endian swap). */
static void STORE_UINT16_REVERSED(void *ptr, UINT16 x)
{
    *(UINT16 *)ptr = (UINT16)((x << 8) | (x >> 8));
}
               
/* ---------------------------------------------------------------------- */
#endif /* ARCH_ENDIAN_LS */
/* ---------------------------------------------------------------------- */


#if ((WORD_LEN == 2) && INTEL_INTRINSICS)

/* ---------------------------------------------------------------------- */
#define ARCH_NH16  1
#if     ARCH_NH16
/* ---------------------------------------------------------------------- */

#if (SSE2)  /* 128-bit vector registers */

#include "emmintrin.h"

/* One NH step on 32 bytes: lanewise 16-bit adds of key and data
 * vectors, then pmaddwd (pairwise 16x16->32 multiply-add), with the
 * eight products summed pairwise into four 32-bit lanes of acc1.
 * Expects d0, d1 and temporaries m0, m1, m2 in the enclosing scope. */
#define NH_STEP(k0,k1,acc1) \
    m0 = _mm_add_epi16(k0,d0); \
    m1 = _mm_add_epi16(k1,d1); \
    m2 = _mm_madd_epi16(m0,m1); \
    acc1 = _mm_add_epi32(acc1, m2)

/* ---------------------------------------------------------------------- */

/* NH hash core: 16-bit words, SSE2 intrinsics, one 32-bit result.
 * Folds dlen bytes of data at dp with key material at kp into the
 * running UINT32 sum at hp.  Each iteration consumes 32 bytes of key
 * and 32 bytes of data.
 * NOTE(review): the do/while on iters = dlen/32 means dlen must be a
 * positive multiple of 32 (dlen == 0 would wrap), and _mm_load_si128
 * requires kp/dp 16-byte aligned -- confirm callers guarantee both. */
static void nh_aux_4(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 t[2];
    UINT32 *p = (UINT32 *)hp;
    UINT32 iters = dlen/32;

    __m128i *key = (__m128i *)kp;
    __m128i *data = (__m128i *)dp;
    __m128i acc;
    __m128i d0,d1,k0,k1,m0,m1,m2;
  
    acc = _mm_setzero_si128();

    do {
        k0 = _mm_load_si128(key + 0);
        k1 = _mm_load_si128(key + 1);
        d0 = _mm_load_si128(data + 0);
        d1 = _mm_load_si128(data + 1);
    
        NH_STEP(k0,k1,acc);

        key += 2;
        data += 2;
    } while (--iters);

    /* Horizontal reduction: fold the upper two 32-bit lanes onto the
     * lower two, store the low 64 bits, and add both halves. */
    acc = _mm_add_epi32(acc, _mm_srli_si128(acc, 8));
    _mm_storel_epi64((__m128i *)t,acc);
    p[0] += t[0] + t[1];
}

/* ---------------------------------------------------------------------- */

/* NH hash core: 16-bit words, SSE2 intrinsics, two 32-bit results.
 * Like nh_aux_4 but computes a second independent sum using the key
 * shifted by one 16-byte vector (Toeplitz-style: k1,k2 vs. k0,k1),
 * adding the two results into p[0] and p[1].
 * NOTE(review): each iteration reads one extra key vector (key + 2)
 * beyond the 32 bytes it advances, so the key buffer must extend 16
 * bytes past dlen.  dlen must be a positive multiple of 32 and
 * kp/dp 16-byte aligned (do/while + _mm_load_si128). */
static void nh_aux_8(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 t[4];
    UINT32 *p = (UINT32 *)hp;
    UINT32 iters = dlen/32;

    __m128i *key = (__m128i *)kp;
    __m128i *data = (__m128i *)dp;
    __m128i acc1,acc2;
    __m128i d0,d1,k0,k1,k2,m0,m1,m2;
  
    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    do {
        k0 = _mm_load_si128(key + 0);
        k1 = _mm_load_si128(key + 1);
        k2 = _mm_load_si128(key + 2);
        d0 = _mm_load_si128(data + 0);
        d1 = _mm_load_si128(data + 1);
    
        NH_STEP(k0,k1,acc1);
        NH_STEP(k1,k2,acc2);
    
        key += 2;
        data += 2;
    } while (--iters);

    /* Fold each accumulator's four 32-bit lanes into one result word. */
    acc1 = _mm_add_epi32(acc1, _mm_srli_si128(acc1, 8));
    _mm_storel_epi64((__m128i *)t,acc1);
    p[0] += t[0] + t[1];
    acc2 = _mm_add_epi32(acc2, _mm_srli_si128(acc2, 8));
    _mm_storel_epi64((__m128i *)(t+2),acc2);
    p[1] += t[2] + t[3];
}

/* ---------------------------------------------------------------------- */

/* NH hash core: 16-bit words, SSE2 intrinsics, four 32-bit results.
 * Computes four independent NH sums, each using the key shifted by a
 * further 16 bytes (Toeplitz-style: (k0,k1),(k1,k2),(k2,k3),(k3,k4)),
 * adding the results into p[0..3].
 * NOTE(review): each iteration reads three extra key vectors beyond
 * the 32 bytes it advances, so the key buffer must extend 48 bytes
 * past dlen.  dlen must be a positive multiple of 32 and kp/dp
 * 16-byte aligned (do/while + _mm_load_si128). */
static void nh_aux_16(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 t[8];
    UINT32 *p = (UINT32 *)hp;
    UINT32 iters = dlen/32;

    __m128i *key = (__m128i *)kp;
    __m128i *data = (__m128i *)dp;
    __m128i acc1,acc2,acc3,acc4;
    __m128i d0,d1,k0,k1,k2,k3,k4,m0,m1,m2;
  
    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();
    acc3 = _mm_setzero_si128();
    acc4 = _mm_setzero_si128();

    do {
        k0 = _mm_load_si128(key + 0);
        k1 = _mm_load_si128(key + 1);
        k2 = _mm_load_si128(key + 2);
        k3 = _mm_load_si128(key + 3);
        k4 = _mm_load_si128(key + 4);
        d0 = _mm_load_si128(data + 0);
        d1 = _mm_load_si128(data + 1);
    
        NH_STEP(k0,k1,acc1);
        NH_STEP(k1,k2,acc2);
        NH_STEP(k2,k3,acc3);
        NH_STEP(k3,k4,acc4);

        key += 2;
        data += 2;
    } while (--iters);

    /* Fold each accumulator's four 32-bit lanes into one result word. */
    acc1 = _mm_add_epi32(acc1, _mm_srli_si128(acc1, 8));
    _mm_storel_epi64((__m128i *)t,acc1);
    p[0] += t[0] + t[1];
    acc2 = _mm_add_epi32(acc2, _mm_srli_si128(acc2, 8));
    _mm_storel_epi64((__m128i *)(t+2),acc2);
    p[1] += t[2] + t[3];
    acc3 = _mm_add_epi32(acc3, _mm_srli_si128(acc3, 8));
    _mm_storel_epi64((__m128i *)(t+4),acc3);
    p[2] += t[4] + t[5];
    acc4 = _mm_add_epi32(acc4, _mm_srli_si128(acc4, 8));
    _mm_storel_epi64((__m128i *)(t+6),acc4);
    p[3] += t[6] + t[7];
}

/* ---------------------------------------------------------------------- */

#else /* No SSE2 */

#include <mmintrin.h>

/* ---------------------------------------------------------------------- */
/* NH hash core: 16-bit words, MMX intrinsics, one 32-bit result.
 * Processes 32 bytes of key and data per iteration: two paddw/pmaddwd
 * pairs (8 bytes of data against the data 16 bytes ahead), 32-bit
 * lanes accumulated in acc, then the two halves are folded and added
 * into p[0].
 * NOTE(review): do/while on iters = dlen/32 -- dlen must be a
 * positive multiple of 32. */
static void nh_aux_4(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 *p = (UINT32 *)hp;
    UINT32 iters = dlen/32;
  
  __m64 *key = (__m64 *)kp;
  __m64 *data = (__m64 *)dp;
  __m64 acc = 0;
  
  do {

    acc = _m_paddd(acc, _m_pmaddwd(_m_paddw(*(key + 0), *(data + 0)), 
                                   _m_paddw(*(key + 2), *(data + 2))));
    acc = _m_paddd(acc, _m_pmaddwd(_m_paddw(*(key + 1), *(data + 1)), 
                                   _m_paddw(*(key + 3), *(data + 3))));

    key += 4;
    data += 4;
  } while (--iters);

    /* Fold high 32-bit lane onto low and add into the result word. */
    p[0] += _m_to_int(_m_paddd(acc, _m_psrlqi(acc, 32)));

    _m_empty();  /* clear MMX state so FP code can run afterwards */
}

/* NH hash core: 16-bit words, MMX intrinsics, two 32-bit results.
 * acc1 pairs data with key at offset 0; acc2 uses the key shifted by
 * two __m64 (16 bytes, Toeplitz-style: key+2/key+4).
 * NOTE(review): each iteration reads key up to index +5 while
 * advancing by 4, so the key buffer must extend 16 bytes past dlen.
 * dlen must be a positive multiple of 32 (do/while). */
static void nh_aux_8(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 *p = (UINT32 *)hp;
    UINT32 iters = dlen/32;
  
  __m64 *key = (__m64 *)kp;
  __m64 *data = (__m64 *)dp;
  __m64 acc1 = 0, acc2 = 0;
  
  do {
    acc1 = _m_paddd(acc1, _m_pmaddwd(_m_paddw(*(key + 0), *(data + 0)), 
                                     _m_paddw(*(key + 2), *(data + 2))));
    acc2 = _m_paddd(acc2, _m_pmaddwd(_m_paddw(*(key + 2), *(data + 0)), 
                                     _m_paddw(*(key + 4), *(data + 2))));

    acc1 = _m_paddd(acc1, _m_pmaddwd(_m_paddw(*(key + 1), *(data + 1)), 
                                     _m_paddw(*(key + 3), *(data + 3))));
    acc2 = _m_paddd(acc2, _m_pmaddwd(_m_paddw(*(key + 3), *(data + 1)), 
                                     _m_paddw(*(key + 5), *(data + 3))));

    key += 4;
    data += 4;
  } while (--iters);

    /* Fold each accumulator's two 32-bit lanes into one result word. */
    p[0] += _m_to_int(_m_paddd(acc1, _m_psrlqi(acc1, 32)));
    p[1] += _m_to_int(_m_paddd(acc2, _m_psrlqi(acc2, 32)));
        
    _m_empty();  /* clear MMX state so FP code can run afterwards */
}

/* NH hash core: 16-bit words, MMX intrinsics, four 32-bit results.
 * Four accumulators, each using the key shifted by a further two
 * __m64 (16 bytes, Toeplitz-style: offsets 0/2/4/6 and 1/3/5/7).
 * NOTE(review): each iteration reads key up to index +9 while
 * advancing by 4, so the key buffer must extend 48 bytes past dlen.
 * dlen must be a positive multiple of 32 (do/while). */
static void nh_aux_16(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 *p = (UINT32 *)hp;
    UINT32 iters = dlen/32;
  
  __m64 *key = (__m64 *)kp;
  __m64 *data = (__m64 *)dp;
  __m64 acc1 = 0, acc2 = 0, acc3 = 0, acc4 = 0;
  
  do {
    acc1 = _m_paddd(acc1, _m_pmaddwd(_m_paddw(*(key + 0), *(data + 0)), 
                                     _m_paddw(*(key + 2), *(data + 2))));
    acc2 = _m_paddd(acc2, _m_pmaddwd(_m_paddw(*(key + 2), *(data + 0)), 
                                     _m_paddw(*(key + 4), *(data + 2))));
    acc3 = _m_paddd(acc3, _m_pmaddwd(_m_paddw(*(key + 4), *(data + 0)), 
                                     _m_paddw(*(key + 6), *(data + 2))));
    acc4 = _m_paddd(acc4, _m_pmaddwd(_m_paddw(*(key + 6), *(data + 0)), 
                                     _m_paddw(*(key + 8), *(data + 2))));

    acc1 = _m_paddd(acc1, _m_pmaddwd(_m_paddw(*(key + 1), *(data + 1)), 
                                     _m_paddw(*(key + 3), *(data + 3))));
    acc2 = _m_paddd(acc2, _m_pmaddwd(_m_paddw(*(key + 3), *(data + 1)), 
                                     _m_paddw(*(key + 5), *(data + 3))));
    acc3 = _m_paddd(acc3, _m_pmaddwd(_m_paddw(*(key + 5), *(data + 1)), 
                                     _m_paddw(*(key + 7), *(data + 3))));
    acc4 = _m_paddd(acc4, _m_pmaddwd(_m_paddw(*(key + 7), *(data + 1)), 
                                     _m_paddw(*(key + 9), *(data + 3))));

    key += 4;
    data += 4;
  } while (--iters);

    /* Fold each accumulator's two 32-bit lanes into one result word. */
    p[0] += _m_to_int(_m_paddd(acc1, _m_psrlqi(acc1, 32)));
    p[1] += _m_to_int(_m_paddd(acc2, _m_psrlqi(acc2, 32)));
    p[2] += _m_to_int(_m_paddd(acc3, _m_psrlqi(acc3, 32)));
    p[3] += _m_to_int(_m_paddd(acc4, _m_psrlqi(acc4, 32)));
    
    _m_empty();  /* clear MMX state so FP code can run afterwards */
}


#endif /* SSE2 */
#endif /* ARCH_NH16 */
#endif /* ((WORD_LEN == 2) && INTEL_INTRINSICS) */

#if ((WORD_LEN == 4) && INTEL_INTRINSICS && SSE2)

/* ---------------------------------------------------------------------- */
#define ARCH_NH32  1
#if     ARCH_NH32
/* ---------------------------------------------------------------------- */

#include "emmintrin.h"

/* One NH step, 32-bit words: lanewise 32-bit adds of key and data,
 * then two pmuludq passes -- one on the even 32-bit lanes, one on the
 * odd lanes after a 4-byte shift -- with the 64-bit products
 * accumulated into acc.  Expects d1, d2 and temporaries m1, m2, m3 in
 * the enclosing scope. */
#define NH_STEP(k1,k2,acc) \
    m1 = _mm_add_epi32(k1, d1); \
    m2 = _mm_add_epi32(k2, d2); \
    m3 = _mm_mul_epu32(m1, m2); \
    acc = _mm_add_epi64(acc, m3); \
    m1 = _mm_srli_si128(m1,4); \
    m2 = _mm_srli_si128(m2,4); \
    m3 = _mm_mul_epu32(m1, m2); \
    acc = _mm_add_epi64(acc, m3)


/* NH hash core: 32-bit words, SSE2 intrinsics, one 64-bit result.
 * Loads the running 64-bit sum at hp, folds in dlen bytes of data and
 * key (128-byte unrolled main loop, then a 32-byte tail loop), and
 * stores the folded sum back.  dlen must be a multiple of 32; dlen of
 * 0 falls straight through.  kp/dp must be 16-byte aligned
 * (_mm_load_si128). */
static void nh_aux_8(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT64 *p = (UINT64 *)hp;
    UINT32 len = dlen;
  
  __m128i *key = (__m128i *)kp;
  __m128i *data = (__m128i *)dp;
  __m128i acc = _mm_loadl_epi64((__m128i const*)p);
  __m128i m1,m2,m3,d1,d2,k1,k2;
  
  while (len >= 128) {
    d1 = _mm_load_si128(data);
    d2 = _mm_load_si128(data+1);
    k1 = _mm_load_si128(key);
    k2 = _mm_load_si128(key+1);
    NH_STEP(k1,k2,acc);
    d1 = _mm_load_si128(data+2);
    d2 = _mm_load_si128(data+3);
    k1 = _mm_load_si128(key+2);
    k2 = _mm_load_si128(key+3);
    NH_STEP(k1,k2,acc);
    d1 = _mm_load_si128(data+4);
    d2 = _mm_load_si128(data+5);
    k1 = _mm_load_si128(key+4);
    k2 = _mm_load_si128(key+5);
    NH_STEP(k1,k2,acc);
    d1 = _mm_load_si128(data+6);
    d2 = _mm_load_si128(data+7);
    k1 = _mm_load_si128(key+6);
    k2 = _mm_load_si128(key+7);
    NH_STEP(k1,k2,acc);
    key += 8;
    data += 8;
    len -= 128;
  }
  /* Tail: remaining 32-byte chunks. */
  while (len >= 32) {
    d1 = _mm_load_si128(data);
    d2 = _mm_load_si128(data+1);
    k1 = _mm_load_si128(key);
    k2 = _mm_load_si128(key+1);
    NH_STEP(k1,k2,acc);
    key += 2;
    data += 2;
    len -= 32;
  }
    
    /* Fold the two 64-bit lanes and store the updated running sum. */
    acc = _mm_add_epi64(acc, _mm_srli_si128(acc, 8));
    _mm_storel_epi64((__m128i *)(p),acc);
}


/* NH hash core: 32-bit words, SSE2 intrinsics, two 64-bit results.
 * Like nh_aux_8 but maintains a second sum (acc2, stored at p[1])
 * computed with the key shifted by one 16-byte vector (Toeplitz-style
 * k2,k3 vs. k1,k2).
 * NOTE(review): each chunk reads one key vector past the data length,
 * so the key buffer must extend 16 bytes past dlen.  dlen must be a
 * multiple of 32 (0 is fine); kp/dp must be 16-byte aligned. */
static void nh_aux_16(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT64 *p = (UINT64 *)hp;
    UINT32 len = dlen;
  
  __m128i *key = (__m128i *)kp;
  __m128i *data = (__m128i *)dp;
  __m128i acc = _mm_loadl_epi64((__m128i const*)p),
          acc2 = _mm_loadl_epi64((__m128i const*)(p+1));
  __m128i m1,m2,m3,d1,d2,k1,k2,k3;
  
  while (len >= 128) {
    d1 = _mm_load_si128(data);
    d2 = _mm_load_si128(data+1);
    k1 = _mm_load_si128(key);
    k2 = _mm_load_si128(key+1);
    k3 = _mm_load_si128(key+2);
    NH_STEP(k1,k2,acc);
    NH_STEP(k2,k3,acc2);
    d1 = _mm_load_si128(data+2);
    d2 = _mm_load_si128(data+3);
    k1 = _mm_load_si128(key+2);
    k2 = _mm_load_si128(key+3);
    k3 = _mm_load_si128(key+4);
    NH_STEP(k1,k2,acc);
    NH_STEP(k2,k3,acc2);
    d1 = _mm_load_si128(data+4);
    d2 = _mm_load_si128(data+5);
    k1 = _mm_load_si128(key+4);
    k2 = _mm_load_si128(key+5);
    k3 = _mm_load_si128(key+6);
    NH_STEP(k1,k2,acc);
    NH_STEP(k2,k3,acc2);
    d1 = _mm_load_si128(data+6);
    d2 = _mm_load_si128(data+7);
    k1 = _mm_load_si128(key+6);
    k2 = _mm_load_si128(key+7);
    k3 = _mm_load_si128(key+8);
    NH_STEP(k1,k2,acc);
    NH_STEP(k2,k3,acc2);
    key += 8;
    data += 8;
    len -= 128;
  }
  /* Tail: remaining 32-byte chunks. */
  while (len >= 32) {
    d1 = _mm_load_si128(data);
    d2 = _mm_load_si128(data+1);
    k1 = _mm_load_si128(key);
    k2 = _mm_load_si128(key+1);
    k3 = _mm_load_si128(key+2);
    NH_STEP(k1,k2,acc);
    NH_STEP(k2,k3,acc2);
    key += 2;
    data += 2;
    len -= 32;
  }
    
    /* Fold each accumulator's two 64-bit lanes and store back. */
    acc = _mm_add_epi64(acc, _mm_srli_si128(acc, 8));
    _mm_storel_epi64((__m128i *)(p),acc);

    acc2 = _mm_add_epi64(acc2, _mm_srli_si128(acc2, 8));
    _mm_storel_epi64((__m128i *)(p+1),acc2);
}


#endif /* ARCH_NH32 */
#endif /* ((WORD_LEN == 4) && INTEL_INTRINSICS && SSE2) */

/* ----------------------------------------------------------------------
 * ----------------------------------------------------------------------
 * ----------------------------------------------------------------------
 * Third define routines which are written using inline assembly.
 * ----------------------------------------------------------------------
 * ----------------------------------------------------------------------
 * ---------------------------------------------------------------------- */
#else /* (USE_C_AND_ASSEMBLY) */
/* ---------------------------------------------------------------------- */


/* ---------------------------------------------------------------------- */
#define ARCH_ENDIAN_LS  1
/* ---------------------------------------------------------------------- */

/* Byte-reverse a 32-bit load using the bswap instruction.  The value
 * is deliberately left in eax, which serves as the function's return
 * value under the MSVC inline-asm convention (hence no C return
 * statement). */
static UINT32 LOAD_UINT32_REVERSED(void *p)
{
    __asm {
        mov eax, p
        mov eax, [eax]
        bswap eax
    }
}

/* Byte-reverse x with bswap and store the result at p. */
static void STORE_UINT32_REVERSED(void *p, UINT32 x)
{
    __asm {
        mov eax,x
        bswap eax
        mov ecx, p
        mov [ecx], eax
    }
}

/* Byte-swap a 16-bit load via rol ax,8.  The value is left in ax,
 * which serves as the return value under the MSVC inline-asm
 * convention (hence no C return statement). */
static UINT16 LOAD_UINT16_REVERSED(void *p)
{
    __asm {
        mov eax, p
        mov ax, [eax]
        rol ax,8
    }
}

/* Byte-swap the 16-bit value x via rol ax,8 and store it at p. */
static void STORE_UINT16_REVERSED(void *p, UINT16 x)
{
    __asm {
        mov ax,x
        rol ax,8
        mov ecx, p
        mov [ecx], ax
    }
}

/* ---------------------------------------------------------------------- */
#define ARCH_RC6  1
/* ---------------------------------------------------------------------- */
/* One RC6 round on block words (a,b,c,d), with the two round keys at
 * byte offsets n and n+4 from esi:
 *   t = rotl(b*(2*b+1), 5);   u = rotl(d*(2*d+1), 5);
 *   a = rotl(a ^ t, u) + S[n/4];   c = rotl(c ^ u, t) + S[n/4+1];
 * eax holds t and ecx holds u (cl supplies the variable rotate
 * counts); both are clobbered. */
#define RC6_BLOCK(a,b,c,d,n) \
    __asm lea eax, [b+b+1] \
    __asm imul eax,b \
    __asm rol eax, 5 \
    __asm lea ecx, [d+d+1] \
    __asm imul ecx,d \
    __asm rol ecx, 5 \
    __asm xor a,eax \
    __asm rol a,cl \
    __asm add a,n[esi] \
    __asm xor c,ecx \
    __asm mov ecx,eax \
    __asm rol c,cl \
    __asm add c,n+4[esi]

/* RC6 encryption (20 rounds) of the 16-byte block at pt into ct,
 * using the expanded key schedule S (UINT32 words read at esi byte
 * offsets 0..172, i.e. 44 words).  The four block words a/b/c/d live
 * in edi/ebx/ebp/edx; ebp is pushed/popped because it is used as a
 * general register here (the C4731 warning this triggers is disabled
 * at the top of the file). */
static void RC6(UINT32 S[], void *pt, void *ct)
{ 
    __asm {
        push ebp
        mov esi, S
        mov ecx, pt
        mov edi,   [ecx]
        mov ebx,  4[ecx]
        mov ebp,  8[ecx]
        mov edx, 12[ecx]
        /* Pre-whitening: b += S[0], d += S[1]. */
        add ebx,   [esi]
        add edx,  4[esi]
      
        /* 20 rounds; the word roles rotate each round, and the round
         * key offset advances 8 bytes per round. */
        RC6_BLOCK(edi,ebx,ebp,edx,8)
        RC6_BLOCK(ebx,ebp,edx,edi,16)
        RC6_BLOCK(ebp,edx,edi,ebx,24)
        RC6_BLOCK(edx,edi,ebx,ebp,32)
      
        RC6_BLOCK(edi,ebx,ebp,edx,40)
        RC6_BLOCK(ebx,ebp,edx,edi,48)
        RC6_BLOCK(ebp,edx,edi,ebx,56)
        RC6_BLOCK(edx,edi,ebx,ebp,64)
      
        RC6_BLOCK(edi,ebx,ebp,edx,72)
        RC6_BLOCK(ebx,ebp,edx,edi,80)
        RC6_BLOCK(ebp,edx,edi,ebx,88)
        RC6_BLOCK(edx,edi,ebx,ebp,96)
      
        RC6_BLOCK(edi,ebx,ebp,edx,104)
        RC6_BLOCK(ebx,ebp,edx,edi,112)
        RC6_BLOCK(ebp,edx,edi,ebx,120)
        RC6_BLOCK(edx,edi,ebx,ebp,128)
      
        RC6_BLOCK(edi,ebx,ebp,edx,136)
        RC6_BLOCK(ebx,ebp,edx,edi,144)
        RC6_BLOCK(ebp,edx,edi,ebx,152)
        RC6_BLOCK(edx,edi,ebx,ebp,160)
        
        /* Post-whitening: a += S[42], c += S[43]; ebp must be
         * restored before ct is written, so c is moved to eax first. */
        mov eax, ebp
        pop ebp
        add edi, 168[esi]
        add eax, 172[esi]
        mov ecx, ct
        mov   [ecx], edi
        mov  4[ecx], ebx
        mov  8[ecx], eax
        mov 12[ecx], edx
    }
} 



#if (WORD_LEN == 4) 
/* ---------------------------------------------------------------------- */
#define ARCH_NH32  1
/* ---------------------------------------------------------------------- */

#if (SSE2)  /* 128-bit vector registers */

/* These macros uses movdqa which requires 16-byte aligned data
 * and key.
 */
/* NH step, 32-bit words, two 64-bit accumulators (xmm6 and xmm7):
 * processes 32 bytes of data at [eax] against the key at [ecx] and
 * against the key shifted 16 bytes (Toeplitz-style).  Each pmuludq
 * handles the even 32-bit lanes; a 4-byte psrldq exposes the odd
 * lanes for a second pmuludq.  Requires 16-byte alignment (movdqa). */
#define NH_STEP_128(n) \
    __asm movdqa xmm2, n[ecx] \
    __asm movdqa xmm3, n+16[ecx] \
    __asm movdqa xmm0, n[eax] \
    __asm movdqa xmm1, n+16[eax] \
    __asm movdqa xmm4, xmm3 \
    __asm paddd xmm2, xmm0 \
    __asm paddd xmm3, xmm1 \
    __asm movdqa xmm5, xmm2 \
    __asm pmuludq xmm2, xmm3 \
    __asm psrldq xmm5, 4 \
    __asm psrldq xmm3, 4 \
    __asm paddq xmm6, xmm2 \
    __asm pmuludq xmm3, xmm5 \
    __asm movdqa xmm5, n+32[ecx] \
    __asm paddd xmm4, xmm0 \
    __asm paddq xmm6, xmm3 \
    __asm paddd xmm5, xmm1 \
    __asm movdqa xmm3, xmm4 \
    __asm pmuludq xmm4, xmm5 \
    __asm psrldq xmm5, 4 \
    __asm psrldq xmm3, 4 \
    __asm pmuludq xmm5, xmm3 \
    __asm paddq xmm7, xmm4 \
    __asm paddq xmm7, xmm5

/* NH step, 32-bit words, single 64-bit accumulator (xmm6): processes
 * 32 bytes of data at [eax] against the key at [ecx]; pmuludq on the
 * even lanes, then on the odd lanes after a 4-byte psrldq.  Requires
 * 16-byte alignment (movdqa). */
#define NH_STEP_64(n) \
    __asm movdqa xmm2, n[ecx] \
    __asm movdqa xmm0, n[eax] \
    __asm movdqa xmm3, n+16[ecx] \
    __asm movdqa xmm1, n+16[eax] \
    __asm paddd xmm2, xmm0 \
    __asm paddd xmm3, xmm1 \
    __asm movdqa xmm5, xmm2 \
    __asm pmuludq xmm2, xmm3 \
    __asm psrldq xmm5, 4 \
    __asm psrldq xmm3, 4 \
    __asm paddq xmm6, xmm2 \
    __asm pmuludq xmm3, xmm5 \
    __asm paddq xmm6, xmm3


/* NH hash core: 32-bit words, SSE2 inline asm, one 64-bit result.
 * Loads the running 64-bit sum at hp into xmm6, folds in dlen bytes
 * via NH_STEP_64 (128-byte unrolled main loop at label1, then a
 * 32-byte tail loop at label3), folds xmm6's two 64-bit lanes, and
 * stores the sum back.  dlen must be a multiple of 32; dlen == 0
 * takes the jb/je path straight to label4.  kp/dp must be 16-byte
 * aligned (movdqa). */
static void nh_aux_8(void *kp, void *dp, void *hp, UINT32 dlen)
{
    __asm{
        mov edx,dlen
        mov eax,hp
        sub edx, 128
        movq xmm6, [eax]
        mov eax, dp
        mov ecx, kp
        jb label2
label1:
        NH_STEP_64(0)
        NH_STEP_64(32)
        NH_STEP_64(64)
        NH_STEP_64(96)
        add eax, 128
        add ecx, 128
        sub edx, 128
        jnb label1
label2:
        add edx,128
        je label4
label3:
        NH_STEP_64(0)
        add eax, 32
        add ecx, 32
        sub edx, 32
        jne label3
label4:
        /* Fold high 64-bit lane onto low and store the running sum. */
        mov eax,hp
        movdqa xmm0,xmm6
        psrldq xmm0, 8
        paddq xmm6, xmm0
        movq [eax], xmm6
    }
}

/* NH hash core: 32-bit words, SSE2 inline asm, two 64-bit results.
 * Like nh_aux_8 but maintains a second running sum in xmm7 (stored at
 * hp+8) computed by NH_STEP_128 with the key shifted 16 bytes
 * (Toeplitz-style).
 * NOTE(review): NH_STEP_128 reads one key vector past the data
 * length, so the key buffer must extend 16 bytes past dlen.  dlen
 * must be a multiple of 32 (0 is fine); kp/dp 16-byte aligned. */
static void nh_aux_16(void *kp, void *dp, void *hp, UINT32 dlen)
{
    __asm{
        mov edx,dlen
        mov eax,hp
        sub edx, 128
        movq xmm6, [eax]
        movq xmm7, 8[eax]
        mov eax, dp
        mov ecx, kp
        jb label2
label1:
        NH_STEP_128(0)
        NH_STEP_128(32)
        NH_STEP_128(64)
        NH_STEP_128(96)
        add eax, 128
        add ecx, 128
        sub edx, 128
        jnb label1
label2:
        add edx,128
        je label4
label3:
        NH_STEP_128(0)
        add eax, 32
        add ecx, 32
        sub edx, 32
        jne label3
label4:
        /* Fold each accumulator's two 64-bit lanes and store back. */
        mov eax,hp
        movdqa xmm0,xmm6
        movdqa xmm1,xmm7
        psrldq xmm0, 8
        psrldq xmm1, 8
        paddq xmm6, xmm0
        paddq xmm7, xmm1
        movq [eax], xmm6
        movq 8[eax], xmm7
    }
}

#else /* no SSE2 */

/* One scalar NH term at byte offset n:
 *   (data[n] + key[n]) * (data[n+16] + key[n+16])
 * as a 32x32->64 mul, accumulated into the 64-bit pair edi:esi.
 * Expects ebx = data pointer, ecx = key pointer; clobbers eax, edx. */
#define NH_STEP(n) \
    __asm mov eax,n[ebx]   \
    __asm mov edx,n+16[ebx] \
    __asm add eax,n[ecx]   \
    __asm add edx,n+16[ecx] \
    __asm mul edx         \
    __asm add esi,eax      \
    __asm adc edi,edx


/* NH hash core: 32-bit words, scalar x86 asm (no SSE2), one 64-bit
 * result.  Loads the running 64-bit sum at hp into edi:esi, folds in
 * dlen bytes of data/key (128-byte unrolled main loop, 32-byte tail
 * loop), and stores it back.  dlen must be a multiple of 32; dlen of
 * 0 skips both loops (jb then je).  ebp is used as the byte counter,
 * hence the push/pop. */
static void nh_aux_8(void *kp, void *dp, void *hp, UINT32 dlen)
{
  __asm{
      push ebp
      mov ecx,kp
      mov ebx,dp
      mov eax,hp
      mov ebp,dlen
      sub ebp,128
      mov esi,[eax]
      mov edi,4[eax]
      jb label2     /* if 0 */
label1:
    NH_STEP(0)
    NH_STEP(4)
    NH_STEP(8)
    NH_STEP(12)
    NH_STEP(32)
    NH_STEP(36)
    NH_STEP(40)
    NH_STEP(44)
    NH_STEP(64)
    NH_STEP(68)
    NH_STEP(72)
    NH_STEP(76)
    NH_STEP(96)
    NH_STEP(100)
    NH_STEP(104)
    NH_STEP(108)
      add ecx,128
      add ebx,128
      sub ebp,128
      jnb label1
label2:
      add ebp,128
      je label4
label3:
    NH_STEP(0)
    NH_STEP(4)
    NH_STEP(8)
    NH_STEP(12)
      add ecx,32
      add ebx,32
      sub ebp,32
      jne label3
label4:
      /* Restore ebp, then store the updated 64-bit sum (lo, hi). */
      pop ebp
      mov eax,hp
      mov [eax],esi
      mov 4[eax],edi
   }
}

/* ---------------------------------------------------------------------- */

/* ---------------------------------------------------------------------- */


/* Two-result NH built from two single-result passes: the second pass
 * runs the same data against the key offset by 16 bytes and
 * accumulates into the 64-bit word 8 bytes further into hp. */
static void nh_aux_16(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT8 *shifted_key = (UINT8 *)kp + 16;
    UINT8 *second_sum  = (UINT8 *)hp + 8;

    nh_aux_8(kp, dp, hp, dlen);
    nh_aux_8(shifted_key, dp, second_sum, dlen);
}

#endif
#endif


/* ---------------------------------------------------------------------- */
#if (WORD_LEN == 2)
/* ---------------------------------------------------------------------- */
#define ARCH_NH16  1
/* ---------------------------------------------------------------------- */

#if (SSE2)  /* 128-bit vector registers */

/* NH step, 16-bit words, four 32-bit accumulators (xmm4..xmm7):
 * processes 32 bytes of data at [eax] against the key at [ecx] at
 * four successive 16-byte Toeplitz shifts.  After each pmaddwd, a
 * psubw removes the data half from the key+data sum so the raw key
 * vector can be re-paired with the other data half for the next
 * shift.  Requires 16-byte alignment (movdqa). */
#define NH_STEP_128(n)  \
    __asm movdqa xmm0,n+0[eax] \
    __asm movdqa xmm1,n+16[eax] \
    __asm movdqa xmm2,n+0[ecx] \
    __asm movdqa xmm3,n+16[ecx] \
    __asm paddw xmm2,xmm0 \
    __asm paddw xmm3,xmm1 \
    __asm pmaddwd xmm2,xmm3 \
    __asm psubw xmm3,xmm1         \
    __asm paddd xmm4,xmm2 \
    __asm movdqa xmm2,n+32[ecx] \
    __asm paddw xmm3,xmm0 \
    __asm paddw xmm2,xmm1 \
    __asm pmaddwd xmm3,xmm2 \
    __asm psubw xmm2,xmm1         \
    __asm paddd xmm5,xmm3 \
    __asm movdqa xmm3,n+48[ecx] \
    __asm paddw xmm2,xmm0 \
    __asm paddw xmm3,xmm1 \
    __asm pmaddwd xmm2,xmm3 \
    __asm psubw xmm3,xmm1         \
    __asm paddd xmm6,xmm2 \
    __asm movdqa xmm2,n+64[ecx] \
    __asm paddw xmm3,xmm0 \
    __asm paddw xmm2,xmm1 \
    __asm pmaddwd xmm3,xmm2 \
    __asm paddd xmm7,xmm3

/* NH step, 16-bit words, two 32-bit accumulators (xmm4, xmm5): same
 * scheme as NH_STEP_128 but only two Toeplitz shifts. */
#define NH_STEP_64(n)          \
    __asm movdqa xmm0,n+0[eax]  \
    __asm movdqa xmm1,n+16[eax] \
    __asm movdqa xmm2,n+0[ecx]  \
    __asm movdqa xmm3,n+16[ecx] \
    __asm paddw xmm2,xmm0         \
    __asm paddw xmm3,xmm1         \
    __asm pmaddwd xmm2,xmm3       \
    __asm paddd xmm4,xmm2         \
    __asm psubw xmm3,xmm1         \
    __asm movdqa xmm2,n+32[ecx] \
    __asm paddw xmm3,xmm0         \
    __asm paddw xmm2,xmm1         \
    __asm pmaddwd xmm2,xmm3       \
    __asm paddd xmm5,xmm2

/* NH step, 16-bit words, single 32-bit accumulator (xmm7): one
 * key/data pairing, no Toeplitz shift. */
#define NH_STEP_32(n)     \
    __asm movdqa xmm0,n+0[ecx]     \
    __asm movdqa xmm1,n+16[ecx]    \
    __asm movdqa xmm2,n+0[eax]     \
    __asm movdqa xmm3,n+16[eax]    \
    __asm paddw xmm2,xmm0         \
    __asm paddw xmm3,xmm1         \
    __asm pmaddwd xmm2,xmm3       \
    __asm paddd xmm7,xmm2

/* NH hash core: 16-bit words, SSE2 inline asm, one 32-bit result.
 * Accumulates into xmm7 via NH_STEP_32 (128-byte unrolled main loop,
 * 32-byte tail loop), folds the accumulator's halves, and adds the
 * two low 32-bit lanes into p[0].  dlen must be a multiple of 32
 * (0 is fine); kp/dp must be 16-byte aligned (movdqa). */
static void nh_aux_4(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 t[2];
    UINT32 *p = (UINT32 *)hp;
  
  __asm{
      mov edx,dlen
      pxor xmm7, xmm7
      sub edx, 128
      mov eax, dp
      mov ecx, kp
      jb label2
label1:
        NH_STEP_32(0)
        NH_STEP_32(32)
        NH_STEP_32(64)
        NH_STEP_32(96)
        add eax, 128
        add ecx, 128
        sub edx, 128
        jnb label1
label2:
      add edx,128
      je label4
label3:
       NH_STEP_32(0)
        add eax, 32
        add ecx, 32
        sub edx, 32
        jne label3
label4:
      /* Fold upper 64 bits onto lower and write the two lanes to t. */
      movdqa xmm0, xmm7
      psrldq xmm7, 8
      paddd xmm7, xmm0
      movq t, xmm7
  }

  p[0] += (t[0] + t[1]);
}


/* ---------------------------------------------------------------------- */

/* NH hash core: 16-bit words, SSE2 inline asm, two 32-bit results.
 * Accumulates into xmm4/xmm5 via NH_STEP_64; the second sum uses the
 * key shifted 16 bytes (Toeplitz-style), so the key buffer must
 * extend 16 bytes past dlen.  dlen must be a multiple of 32 (0 is
 * fine); kp/dp must be 16-byte aligned (movdqa). */
static void nh_aux_8(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 t[4];
    UINT32 *p = (UINT32 *)hp;
  
  __asm{
      mov edx,dlen
      pxor xmm4, xmm4
      pxor xmm5, xmm5
      sub edx, 128
      mov eax, dp
      mov ecx, kp
      jb label2
label1:
        NH_STEP_64(0)
        NH_STEP_64(32)
        NH_STEP_64(64)
        NH_STEP_64(96)
        add eax, 128
        add ecx, 128
        sub edx, 128
        jnb label1
label2:
      add edx,128
      je label4
label3:
       NH_STEP_64(0)
        add eax, 32
        add ecx, 32
        sub edx, 32
        jne label3
label4:
      /* Fold each accumulator's halves and write four lanes to t. */
      movdqa xmm0, xmm4
      movdqa xmm1, xmm5
      psrldq xmm4, 8
      psrldq xmm5, 8
      paddd xmm0, xmm4
      paddd xmm1, xmm5
      movq t, xmm0
      movq t+8, xmm1
  }

  p[0] += (t[0] + t[1]);
  p[1] += (t[2] + t[3]);
}

/* ---------------------------------------------------------------------- */


/* ---------------------------------------------------------------------- */

 void nh_aux_16(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 t[8];
    UINT32 *p = (UINT32 *)hp;
  
  __asm{
      mov edx,dlen
      pxor xmm4, xmm4
      pxor xmm5, xmm5
      pxor xmm6, xmm6
      pxor xmm7, xmm7
      sub edx, 128
      mov eax, dp
      mov ecx, kp
      jb label2
label1:
        NH_STEP_128(0)
        NH_STEP_128(32)
        NH_STEP_128(64)
        NH_STEP_128(96)
        add eax, 128
        add ecx, 128
        sub edx, 128
        jnb label1
label2:
      add edx,128
      je label4
label3:
       NH_STEP_128(0)
        add eax, 32
        add ecx, 32
        sub edx, 32
        jne label3
label4:
      movdqa xmm0, xmm4
      movdqa xmm1, xmm5
      movdqa xmm2, xmm6
      movdqa xmm3, xmm7
      psrldq xmm4, 8
      psrldq xmm5, 8
      psrldq xmm6, 8
      psrldq xmm7, 8
      paddd xmm4, xmm0
      paddd xmm5, xmm1
      paddd xmm6, xmm2
      paddd xmm7, xmm3
      movq t   , xmm4
      movq t+ 8, xmm5
      movq t+16, xmm6
      movq t+24, xmm7
  }

  p[0] += (t[0] + t[1]);
  p[1] += (t[2] + t[3]);
  p[2] += (t[4] + t[5]);
  p[3] += (t[6] + t[7]);
}

#else /* no SSE */

/* NH step, 16-bit words, MMX, four 32-bit accumulators (mm4..mm7):
 * processes 32 bytes of data at [eax] against the key at [ecx] at
 * four successive 16-byte Toeplitz shifts, in two 8-byte halves.
 * After each pmaddwd a psubw removes the data half from the key+data
 * sum so the raw key can be re-paired for the next shift. */
#define NH_STEP_128(n)  \
    __asm movq mm0,n+0[eax] \
    __asm movq mm1,n+16[eax] \
    __asm movq mm2,n+0[ecx] \
    __asm movq mm3,n+16[ecx] \
    __asm paddw mm2,mm0 \
    __asm paddw mm3,mm1 \
    __asm pmaddwd mm2,mm3 \
    __asm paddd mm4,mm2 \
    __asm psubw mm3,mm1         \
    __asm movq mm2,n+32[ecx] \
    __asm paddw mm3,mm0 \
    __asm paddw mm2,mm1 \
    __asm pmaddwd mm3,mm2 \
    __asm paddd mm5,mm3 \
    __asm psubw mm2,mm1         \
    __asm movq mm3,n+48[ecx] \
    __asm paddw mm2,mm0 \
    __asm paddw mm3,mm1 \
    __asm pmaddwd mm2,mm3 \
    __asm paddd mm6,mm2 \
    __asm psubw mm3,mm1         \
    __asm movq mm2,n+64[ecx] \
    __asm paddw mm3,mm0 \
    __asm paddw mm2,mm1 \
    __asm pmaddwd mm3,mm2 \
    __asm paddd mm7,mm3 \
    __asm movq mm0,n+8[eax] \
    __asm movq mm1,n+24[eax] \
    __asm movq mm2,n+8[ecx] \
    __asm movq mm3,n+24[ecx] \
    __asm paddw mm2,mm0 \
    __asm paddw mm3,mm1 \
    __asm pmaddwd mm2,mm3 \
    __asm paddd mm4,mm2 \
    __asm psubw mm3,mm1         \
    __asm movq mm2,n+40[ecx] \
    __asm paddw mm3,mm0 \
    __asm paddw mm2,mm1 \
    __asm pmaddwd mm3,mm2 \
    __asm paddd mm5,mm3 \
    __asm psubw mm2,mm1         \
    __asm movq mm3,n+56[ecx] \
    __asm paddw mm2,mm0 \
    __asm paddw mm3,mm1 \
    __asm pmaddwd mm2,mm3 \
    __asm paddd mm6,mm2 \
    __asm psubw mm3,mm1         \
    __asm movq mm2,n+72[ecx] \
    __asm paddw mm3,mm0 \
    __asm paddw mm2,mm1 \
    __asm pmaddwd mm3,mm2 \
    __asm paddd mm7,mm3

/* NH step, 16-bit words, MMX, two 32-bit accumulators (mm4, mm5):
 * same scheme as NH_STEP_128 but only two Toeplitz shifts, again in
 * two 8-byte halves. */
#define NH_STEP_64(n)          \
    __asm movq mm0,n+0[eax]  \
    __asm movq mm1,n+16[eax] \
    __asm movq mm2,n+0[ecx]  \
    __asm movq mm3,n+16[ecx] \
    __asm paddw mm2,mm0         \
    __asm paddw mm3,mm1         \
    __asm pmaddwd mm2,mm3       \
    __asm paddd mm4,mm2         \
    __asm psubw mm3,mm1         \
    __asm movq mm2,n+32[ecx] \
    __asm paddw mm3,mm0         \
    __asm paddw mm2,mm1         \
    __asm pmaddwd mm2,mm3       \
    __asm paddd mm5,mm2         \
    __asm movq mm0,n+8[eax]  \
    __asm movq mm1,n+24[eax] \
    __asm movq mm2,n+8[ecx]  \
    __asm movq mm3,n+24[ecx] \
    __asm paddw mm2,mm0         \
    __asm paddw mm3,mm1         \
    __asm pmaddwd mm2,mm3       \
    __asm paddd mm4,mm2         \
    __asm psubw mm3,mm1         \
    __asm movq mm2,n+40[ecx] \
    __asm paddw mm3,mm0         \
    __asm paddw mm2,mm1         \
    __asm pmaddwd mm2,mm3       \
    __asm paddd mm5,mm2

/* One 32-byte NH step for the 4-byte-output hash (see nh_aux_4 below).
 * On entry eax = data pointer, ecx = key pointer; n is a constant byte
 * offset applied to both (note key is loaded into mm0/mm1 here, the
 * reverse of NH_STEP_64 -- the sum is commutative so the result is the
 * same).  Adds key and data in 16-bit lanes (paddw), multiplies
 * adjacent lanes into 32-bit products (pmaddwd), and accumulates the
 * products into the single 2x32-bit accumulator mm7.
 * NOTE(review): clobbers mm0-mm6; caller must pre-zero mm7. */
#define NH_STEP_32(n)     \
    __asm movq mm0,n+0[ecx]     \
    __asm movq mm1,n+16[ecx]    \
    __asm movq mm2,n+0[eax]     \
    __asm movq mm3,n+16[eax]    \
    __asm paddw mm2,mm0         \
    __asm paddw mm3,mm1         \
    __asm movq mm4,n+8[ecx]     \
    __asm movq mm6,n+24[ecx]    \
    __asm pmaddwd mm2,mm3       \
    __asm movq mm5,n+8[eax]     \
    __asm movq mm3,n+24[eax]    \
    __asm paddd mm7,mm2         \
    __asm paddw mm5,mm4         \
    __asm paddw mm3,mm6         \
    __asm pmaddwd mm5,mm3       \
    __asm paddd mm7,mm5       

/* NH hash core, 4-byte-output version: hashes dlen bytes of data (dp)
 * under the key (kp) and adds the result into *(UINT32 *)hp.
 * dlen is assumed to be a positive multiple of 32 bytes -- TODO confirm
 * against callers.  The main loop consumes 128 bytes per iteration;
 * a tail loop handles the remaining 32-byte steps.  MSVC x86 + MMX. */
static void nh_aux_4(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 t[2];                /* the two 32-bit halves of the mm7 accumulator */
    UINT32 *p = (UINT32 *)hp;
  
  __asm{
      mov edx,dlen
      pxor mm7, mm7             /* zero the accumulator */
      sub edx, 128
      mov eax, dp
      mov ecx, kp
      jb label2                 /* fewer than 128 bytes: go to tail loop */
label1:
        NH_STEP_32(0)
        NH_STEP_32(32)
        NH_STEP_32(64)
        NH_STEP_32(96)
        add eax, 128
        add ecx, 128
        sub edx, 128
        jnb label1
label2:
      add edx,128               /* edx = bytes left (0..96) */
      je label4
label3:
       NH_STEP_32(0)
        add eax, 32
        add ecx, 32
        sub edx, 32
        jne label3
label4:
      movq t   , mm7            /* spill accumulator to memory */
      emms                      /* leave MMX state so FP code works again */
  }

  p[0] += (t[0] + t[1]);        /* fold the two 32-bit halves into the hash */
}


/* ---------------------------------------------------------------------- */

/* NH hash core, 8-byte-output version: hashes dlen bytes of data (dp)
 * under the key (kp) and adds two 32-bit results into ((UINT32 *)hp)[0..1].
 * The second result is computed under the key shifted 16 bytes (see
 * NH_STEP_64).  dlen is assumed to be a positive multiple of 32 bytes
 * -- TODO confirm against callers.  Main loop: 128 bytes/iteration;
 * tail loop: 32 bytes/iteration.  MSVC x86 + MMX. */
static void nh_aux_8(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 t[4];                /* halves of the mm4 and mm5 accumulators */
    UINT32 *p = (UINT32 *)hp;
  
  __asm{
      mov edx,dlen
      pxor mm4, mm4             /* zero both accumulators */
      pxor mm5, mm5
      sub edx, 128
      mov eax, dp
      mov ecx, kp
      jb label2                 /* fewer than 128 bytes: go to tail loop */
label1:
        NH_STEP_64(0)
        NH_STEP_64(32)
        NH_STEP_64(64)
        NH_STEP_64(96)
        add eax, 128
        add ecx, 128
        sub edx, 128
        jnb label1
label2:
      add edx,128               /* edx = bytes left (0..96) */
      je label4
label3:
       NH_STEP_64(0)
        add eax, 32
        add ecx, 32
        sub edx, 32
        jne label3
label4:
      movq t   , mm4            /* spill accumulators to memory */
      movq t+ 8, mm5
      emms                      /* leave MMX state so FP code works again */
  }

  p[0] += (t[0] + t[1]);        /* fold each accumulator's halves */
  p[1] += (t[2] + t[3]);
}

/* ---------------------------------------------------------------------- */


/* ---------------------------------------------------------------------- */

/* NH hash core, 16-byte-output version: hashes dlen bytes of data (dp)
 * under the key (kp) and adds four 32-bit results into
 * ((UINT32 *)hp)[0..3], using NH_STEP_128 (defined above) which keeps
 * four accumulators mm4-mm7 under successively shifted keys.  dlen is
 * assumed to be a positive multiple of 32 bytes -- TODO confirm.
 * NOTE(review): unlike nh_aux_4/nh_aux_8 this is not declared static
 * -- confirm whether external linkage is intended. */
 void nh_aux_16(void *kp, void *dp, void *hp, UINT32 dlen)
{
    UINT32 t[8];                /* halves of the mm4..mm7 accumulators */
    UINT32 *p = (UINT32 *)hp;
  
  __asm{
      mov edx,dlen
      pxor mm4, mm4             /* zero all four accumulators */
      pxor mm5, mm5
      pxor mm6, mm6
      pxor mm7, mm7
      sub edx, 128
      mov eax, dp
      mov ecx, kp
      jb label2                 /* fewer than 128 bytes: go to tail loop */
label1:
        NH_STEP_128(0)
        NH_STEP_128(32)
        NH_STEP_128(64)
        NH_STEP_128(96)
        add eax, 128
        add ecx, 128
        sub edx, 128
        jnb label1
label2:
      add edx,128               /* edx = bytes left (0..96) */
      je label4
label3:
       NH_STEP_128(0)
        add eax, 32
        add ecx, 32
        sub edx, 32
        jne label3
label4:
      movq t   , mm4            /* spill accumulators to memory */
      movq t+ 8, mm5
      movq t+16, mm6
      movq t+24, mm7
      emms                      /* leave MMX state so FP code works again */
  }

  p[0] += (t[0] + t[1]);        /* fold each accumulator's halves */
  p[1] += (t[2] + t[3]);
  p[2] += (t[4] + t[5]);
  p[3] += (t[6] + t[7]);
}


#endif
#endif


#if (WORD_LEN == 4)
/* ---------------------------------------------------------------------- */
#define ARCH_IP  1
/* ---------------------------------------------------------------------- */

/* Inner-product hash step (WORD_LEN == 4).  Returns, mod 2^64:
 *   t + sum over the four 16-bit chunks of `data` of chunk * K_i,
 * where each 64-bit key word K_i is stored at ipkp as a low dword
 * followed by a high dword (chunk * low dword contributes 64 bits via
 * mul/add/adc; chunk * high dword contributes to the top 32 bits only).
 * The 64-bit sum is accumulated in esi (low) : ebp (high).
 * There is deliberately no C return statement: MSVC returns a UINT64
 * in edx:eax, which the final lea/mov leave loaded (the compiler's
 * "no return value" warning is expected here).
 * ebp is saved/restored by hand because it is used as the high-word
 * accumulator; MSVC's inline assembler handles preservation of the
 * other general registers used here -- per the documented __asm rules. */
static UINT64 ip_aux(UINT64 t, UINT64 *ipkp, UINT64 data)
{
    UINT32 data_hi = (UINT32)(data >> 32),
           data_lo = (UINT32)(data),
           t_hi = (UINT32)(t >> 32),
           t_lo = (UINT32)(t);
    __asm{
        mov edi, ipkp
        mov ebx,data_hi
        mov ecx,data_lo
        mov esi, t_lo
        mov edx, t_hi
        push ebp                /* ebp doubles as the high accumulator */
        mov ebp,edx
        mov eax,ebx
        shr eax,16              /* chunk = high 16 bits of data_hi */
        mul DWORD PTR 0[edi]    /* chunk * low dword of K0 */
        add esi,eax
        adc ebp,edx
        mov eax,ebx
        shr eax,16
        mul DWORD PTR 4[edi]    /* chunk * high dword of K0 (top 32 bits only) */
        add ebp,eax

        movzx eax,bx            /* chunk = low 16 bits of data_hi */
        mul DWORD PTR 8[edi]
        add esi,eax
        adc ebp,edx
        movzx eax,bx
        mul DWORD PTR 12[edi]
        add ebp,eax

        mov eax,ecx
        shr eax,16              /* chunk = high 16 bits of data_lo */
        mul DWORD PTR 16[edi]
        add esi,eax
        adc ebp,edx
        mov eax,ecx
        shr eax,16
        mul DWORD PTR 20[edi]
        add ebp,eax

        movzx eax,cx            /* chunk = low 16 bits of data_lo */
        mul DWORD PTR 24[edi]
        add esi,eax
        adc ebp,edx
        movzx eax,cx
        mul DWORD PTR 28[edi]
        lea edx,[eax+ebp]       /* edx = final high word */
        mov eax,esi             /* eax = final low word */
        pop ebp
        /* MSVC returns UINT64 in edx:eax */
    }
}

/* Reduce t modulo p36 = 2^36 - 5 and return the low 32 bits of the
 * residue.  The bits above 2^36 are folded back in as (t >> 36) * 5
 * (the lea computes edi + edi*4), then p36 = 0xF:FFFFFFFB is
 * subtracted once if the folded value is >= p36.  Because only the
 * low 32 bits are returned, the matching sbb of the high word is
 * intentionally omitted (see the comment in the asm).
 * There is deliberately no C return statement: MSVC returns the
 * UINT32 in eax. */
static UINT32 ip_reduce_p36(UINT64 t)
{
    UINT32 t_hi = (UINT32)(t >> 32),
           t_lo = (UINT32)(t);
    __asm{
        mov edx,t_hi
        mov eax,t_lo
        mov edi,edx
        and edx,15              /* keep bits 32..35 in the high word */
        shr edi,4               /* edi = t >> 36 */
        lea edi,[edi+edi*4]     /* edi = (t >> 36) * 5 */
        add eax,edi
        adc edx,0
        cmp edx,0xf             /* compare edx:eax against p36 */
        jb skip_sub
        ja do_sub
        cmp eax,0xfffffffb
        jb skip_sub
do_sub:
        sub eax, 0xfffffffb
        /* sbb  edx, 0xf We don't return the high word */
skip_sub:
    }
}

#else  /* WORD_LEN == 2 */

/* ---------------------------------------------------------------------- */
#define ARCH_IP  1
/* ---------------------------------------------------------------------- */

/* Inner-product hash step (WORD_LEN == 2).  Returns, mod 2^64:
 *   t + (data >> 16) * ipkp[0] + (data & 0xffff) * ipkp[1],
 * accumulated 64-bit in esi:edi and folded into edx:eax at the end.
 * There is deliberately no C return statement: MSVC returns a UINT64
 * in edx:eax, which the final add/adc leave loaded. */
static UINT64 ip_aux(UINT64 t, UINT32 *ipkp, UINT32 data)
{
    UINT32 t_hi = (UINT32)(t >> 32),
           t_lo = (UINT32)(t);
    __asm{
        mov ecx, data
        mov ebx, ipkp
        mov esi,t_lo
        mov edi,t_hi
        mov eax,ecx
        shr eax,16              /* high 16 bits of data */
        mul DWORD PTR [ebx]     /* * ipkp[0] */
        add esi,eax
        adc edi,edx
        movzx eax,cx            /* low 16 bits of data */
        mul DWORD PTR 4[ebx]    /* * ipkp[1] */
        add eax,esi
        adc edx,edi
        /* MSVC returns UINT64 in edx:eax */
    }
}

/* Reduce t modulo p19 = 2^19 - 1: the shld extracts t >> 19 into edx
 * (assumes t is small enough that t_hi << 13 does not overflow --
 * TODO confirm the caller's bound on t), adds it to t & 0x7ffff, then
 * conditionally subtracts p19 via cmovc (requires a P6-class CPU).
 * There is deliberately no C return statement: MSVC leaves the value
 * in eax and the UINT16 return type truncates it to 16 bits.
 * NOTE(review): unlike the WORD_LEN==4 counterpart this is not
 * declared static -- confirm intended linkage. */
 UINT16 ip_reduce_p19(UINT64 t)
{
    UINT32 t_hi = (UINT32)(t >> 32),
           t_lo = (UINT32)(t);
    __asm{
        mov edx,t_hi
        mov eax,t_lo
        shld edx,eax,13         /* edx = (t_hi << 13) | (t_lo >> 19) = t >> 19 */
        and eax,0x7ffff         /* eax = t mod 2^19 */
        add eax,edx
        mov edx,eax             /* keep pre-subtraction copy */
        sub eax,0x7ffff
        cmovc eax,edx           /* undo subtraction if result went negative */
    }
}

#endif
#endif


